Skip to main content

url_normalize/
lib.rs

1mod data_url;
2mod encode;
3mod options;
4mod path;
5mod query;
6mod url_parser;
7
8pub use options::*;
9
10use std::fmt;
11
/// Errors that can occur during URL normalization.
///
/// The type is `Clone` and comparable (`PartialEq`/`Eq`) so callers can match
/// returned errors against expected values, e.g. with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NormalizeUrlError {
    /// The URL string could not be parsed. Carries the URL text that was rejected.
    InvalidUrl(String),
    /// `force_http` and `force_https` cannot be used together.
    ConflictingOptions,
}

impl fmt::Display for NormalizeUrlError {
    /// Writes the human-readable message for the error variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidUrl(url) => write!(f, "Invalid URL: {url}"),
            Self::ConflictingOptions => {
                write!(
                    f,
                    "The `force_http` and `force_https` options cannot be used together"
                )
            }
        }
    }
}

impl std::error::Error for NormalizeUrlError {}
36
37/// Normalize a URL string according to the provided options.
38///
39/// # Examples
40///
41/// ```
42/// use url_normalize::{normalize_url, Options};
43///
44/// let result = normalize_url("https://www.example.com/foo/", &Options::default()).unwrap();
45/// assert_eq!(result, "https://example.com/foo");
46/// ```
47pub fn normalize_url(url_string: &str, options: &Options) -> Result<String, NormalizeUrlError> {
48    let mut url_string = url_string.trim().to_string();
49
50    if options.force_http && options.force_https {
51        return Err(NormalizeUrlError::ConflictingOptions);
52    }
53
54    // Data URL
55    if url_string
56        .get(..5)
57        .map(|s| s.eq_ignore_ascii_case("data:"))
58        .unwrap_or(false)
59    {
60        return data_url::normalize_data_url(&url_string, options.strip_hash);
61    }
62
63    // Custom protocol detection
64    let custom_protocol = detect_custom_protocol(&url_string);
65    let normalized_custom_protocols: Vec<String> = options
66        .custom_protocols
67        .iter()
68        .filter_map(|p| {
69            let p = p.trim().to_lowercase();
70            let p = p.strip_suffix(':').unwrap_or(&p).to_string();
71            if p.is_empty() {
72                None
73            } else {
74                Some(format!("{p}:"))
75            }
76        })
77        .collect();
78
79    if let Some(ref cp) = custom_protocol {
80        if !normalized_custom_protocols.iter().any(|p| p == cp) {
81            return Ok(url_string);
82        }
83    }
84
85    let has_relative_protocol = url_string.starts_with("//");
86    let is_relative_url =
87        !has_relative_protocol && (url_string.starts_with("./") || url_string.starts_with("../"));
88
89    // Reject invalid relative paths like "/" or "/relative/path/"
90    if !is_relative_url
91        && !has_relative_protocol
92        && custom_protocol.is_none()
93        && (url_string == "/" || (url_string.starts_with('/') && !url_string.starts_with("//")))
94    {
95        return Err(NormalizeUrlError::InvalidUrl(url_string));
96    }
97
98    // Prepend protocol
99    if !is_relative_url && custom_protocol.is_none() {
100        let default_proto = match options.default_protocol {
101            Protocol::Http => "http:",
102            Protocol::Https => "https:",
103        };
104
105        if has_relative_protocol {
106            url_string = format!("{default_proto}{url_string}");
107        } else if !url_string.contains("://") {
108            url_string = format!("{default_proto}//{url_string}");
109        }
110    }
111
112    let mut parsed = url_parser::ParsedUrl::parse(&url_string)
113        .map_err(|_| NormalizeUrlError::InvalidUrl(url_string.clone()))?;
114
115    // Reject URLs with empty host (like "http://")
116    if parsed.host.is_empty() && (parsed.scheme == "http" || parsed.scheme == "https") {
117        return Err(NormalizeUrlError::InvalidUrl(url_string));
118    }
119
120    // Force HTTP / HTTPS
121    if options.force_http && parsed.scheme.eq_ignore_ascii_case("https") {
122        parsed.scheme = "http".to_string();
123    }
124    if options.force_https && parsed.scheme.eq_ignore_ascii_case("http") {
125        parsed.scheme = "https".to_string();
126    }
127
128    // Strip authentication
129    if options.strip_authentication {
130        parsed.username = String::new();
131        parsed.password = String::new();
132    }
133
134    // Strip hash
135    if options.strip_hash {
136        parsed.fragment = None;
137    } else if options.strip_text_fragment {
138        if let Some(ref mut frag) = parsed.fragment {
139            if let Some(idx) = frag.find(":~:text") {
140                if idx == 0 {
141                    *frag = String::new();
142                } else {
143                    frag.truncate(idx);
144                }
145            }
146        }
147    }
148
149    // Remove empty fragment (bare #)
150    if let Some(ref frag) = parsed.fragment {
151        if frag.is_empty() {
152            parsed.fragment = None;
153        }
154    }
155
156    // Path normalization: remove duplicate slashes
157    if !parsed.path.is_empty() {
158        parsed.path = path::remove_duplicate_slashes(&parsed.path);
159    }
160
161    // Decode URI octets in pathname
162    if !parsed.path.is_empty() {
163        parsed.path = path::decode_pathname(&parsed.path);
164    }
165
166    // Remove directory index
167    path::remove_directory_index(&mut parsed.path, &options.remove_directory_index);
168
169    // Remove path
170    if options.remove_path {
171        parsed.path = "/".to_string();
172    }
173
174    // Transform path
175    if let Some(ref transform) = options.transform_path {
176        let components: Vec<String> = parsed
177            .path
178            .split('/')
179            .filter(|s| !s.is_empty())
180            .map(|s| s.to_string())
181            .collect();
182        let new_components = transform(components);
183        if new_components.is_empty() {
184            parsed.path = "/".to_string();
185        } else {
186            parsed.path = format!("/{}", new_components.join("/"));
187        }
188    }
189
190    // Hostname normalization
191    if !parsed.host.is_empty() {
192        // Remove trailing dot
193        if parsed.host.ends_with('.') {
194            parsed.host.pop();
195        }
196
197        // Strip www.
198        if options.strip_www {
199            let host = parsed.host.to_lowercase();
200            if host.starts_with("www.")
201                && !host[4..].starts_with("www.")
202                && is_valid_www_strip(&host)
203            {
204                parsed.host = parsed.host[4..].to_string();
205            }
206        }
207    }
208
209    // Remove default ports
210    match parsed.port {
211        Some(80) if parsed.scheme.eq_ignore_ascii_case("http") => parsed.port = None,
212        Some(443) if parsed.scheme.eq_ignore_ascii_case("https") => parsed.port = None,
213        _ => {}
214    }
215
216    // Query parameter operations
217    let original_query = parsed.query.clone();
218    query::process_query(&mut parsed, options, original_query.as_deref());
219
220    // Remove trailing slash
221    if options.remove_trailing_slash && parsed.path.ends_with('/') && parsed.path.len() > 1 {
222        parsed.path.pop();
223    }
224
225    // Remove explicit port
226    if options.remove_explicit_port {
227        parsed.port = None;
228    }
229
230    // Build URL string
231    let old_url_string = url_string.clone();
232    url_string = parsed.to_string();
233
234    // Single slash handling
235    if !options.remove_single_slash
236        && parsed.path == "/"
237        && !old_url_string.ends_with('/')
238        && parsed.fragment.is_none()
239        && url_string.ends_with('/')
240    {
241        url_string.pop();
242    }
243
244    // Remove ending `/`
245    if (options.remove_trailing_slash || parsed.path == "/")
246        && parsed.fragment.is_none()
247        && options.remove_single_slash
248        && url_string.ends_with('/')
249    {
250        url_string.pop();
251    }
252
253    // Restore relative protocol
254    if has_relative_protocol && !options.normalize_protocol {
255        if let Some(rest) = url_string.strip_prefix("http://") {
256            url_string = format!("//{rest}");
257        }
258    }
259
260    // Strip protocol
261    if options.strip_protocol {
262        if let Some(rest) = url_string.strip_prefix("https://") {
263            url_string = rest.to_string();
264        } else if let Some(rest) = url_string.strip_prefix("http://") {
265            url_string = rest.to_string();
266        } else if let Some(rest) = url_string.strip_prefix("//") {
267            url_string = rest.to_string();
268        }
269    }
270
271    Ok(url_string)
272}
273
/// Extracts a non-standard scheme from the start of `url_string`.
///
/// Returns the lowercased scheme with a trailing `:` (e.g. `"sindre:"`) when the text before
/// the first `:` is a syntactically valid scheme that is none of `http`, `https`, `file` or
/// `data`. A scheme containing a `.` is only accepted when it is directly followed by `//`;
/// otherwise input such as `example.com:8080` would be mistaken for a scheme.
/// Returns `None` in every other case.
fn detect_custom_protocol(url_string: &str) -> Option<String> {
    let (candidate, remainder) = url_string.split_once(':')?;

    // A scheme starts with an ASCII letter and continues with letters, digits, `+`, `-`, `.`.
    let begins_with_letter =
        matches!(candidate.bytes().next(), Some(first) if first.is_ascii_alphabetic());
    let only_scheme_bytes = candidate
        .bytes()
        .all(|b| matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'+' | b'-' | b'.'));
    if !(begins_with_letter && only_scheme_bytes) {
        return None;
    }

    // The candidate is pure ASCII at this point, so ASCII lowercasing is sufficient.
    let scheme = candidate.to_ascii_lowercase();
    if matches!(scheme.as_str(), "http" | "https" | "file" | "data") {
        return None;
    }

    let followed_by_authority = remainder.starts_with("//");
    if scheme.contains('.') && !followed_by_authority {
        return None;
    }

    Some(scheme + ":")
}
301
/// Decides whether the leading `www.` of `host` may be stripped.
///
/// `host` is expected to be lowercased already: uppercase letters fail the character checks.
/// Returns `true` only when `host` starts with `www.` and what follows has the shape
/// `<label>.<rest>`, where
///
/// * `label` is 1 to 63 bytes of `a-z`, `0-9` or `-`, and
/// * `rest` is 2 to 63 bytes of `a-z`, `0-9`, `-` or `.`.
///
/// Any host that does not start with `www.` (including one shorter than four bytes) yields
/// `false` rather than panicking on an out-of-range slice.
fn is_valid_www_strip(host: &str) -> bool {
    let is_label_byte = |b: u8| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'-';

    host.strip_prefix("www.")
        // Split at the first dot after the prefix: `label` before it, `rest` after it.
        .and_then(|without_www| without_www.split_once('.'))
        .map_or(false, |(label, rest)| {
            let label_ok =
                (1..=63).contains(&label.len()) && label.bytes().all(|b| is_label_byte(b));
            let rest_ok = (2..=63).contains(&rest.len())
                && rest.bytes().all(|b| is_label_byte(b) || b == b'.');
            label_ok && rest_ok
        })
}