1mod data_url;
2mod encode;
3mod options;
4mod path;
5mod query;
6mod url_parser;
7
8pub use options::*;
9
10use std::fmt;
11
/// Errors returned by [`normalize_url`].
///
/// The type is comparable and cloneable so callers can assert on it in tests and
/// store it alongside other data.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NormalizeUrlError {
    /// The input could not be treated as a URL. The payload is the URL text that was
    /// rejected and is echoed in the `Display` message.
    InvalidUrl(String),
    /// Both `force_http` and `force_https` were enabled in the options.
    ConflictingOptions,
}
20
21impl fmt::Display for NormalizeUrlError {
22 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23 match self {
24 Self::InvalidUrl(url) => write!(f, "Invalid URL: {url}"),
25 Self::ConflictingOptions => {
26 write!(
27 f,
28 "The `force_http` and `force_https` options cannot be used together"
29 )
30 }
31 }
32 }
33}
34
35impl std::error::Error for NormalizeUrlError {}
36
37pub fn normalize_url(url_string: &str, options: &Options) -> Result<String, NormalizeUrlError> {
48 let mut url_string = url_string.trim().to_string();
49
50 if options.force_http && options.force_https {
51 return Err(NormalizeUrlError::ConflictingOptions);
52 }
53
54 if url_string
56 .get(..5)
57 .map(|s| s.eq_ignore_ascii_case("data:"))
58 .unwrap_or(false)
59 {
60 return data_url::normalize_data_url(&url_string, options.strip_hash);
61 }
62
63 let custom_protocol = detect_custom_protocol(&url_string);
65 let normalized_custom_protocols: Vec<String> = options
66 .custom_protocols
67 .iter()
68 .filter_map(|p| {
69 let p = p.trim().to_lowercase();
70 let p = p.strip_suffix(':').unwrap_or(&p).to_string();
71 if p.is_empty() {
72 None
73 } else {
74 Some(format!("{p}:"))
75 }
76 })
77 .collect();
78
79 if let Some(ref cp) = custom_protocol {
80 if !normalized_custom_protocols.iter().any(|p| p == cp) {
81 return Ok(url_string);
82 }
83 }
84
85 let has_relative_protocol = url_string.starts_with("//");
86 let is_relative_url =
87 !has_relative_protocol && (url_string.starts_with("./") || url_string.starts_with("../"));
88
89 if !is_relative_url
91 && !has_relative_protocol
92 && custom_protocol.is_none()
93 && (url_string == "/" || (url_string.starts_with('/') && !url_string.starts_with("//")))
94 {
95 return Err(NormalizeUrlError::InvalidUrl(url_string));
96 }
97
98 if !is_relative_url && custom_protocol.is_none() {
100 let default_proto = match options.default_protocol {
101 Protocol::Http => "http:",
102 Protocol::Https => "https:",
103 };
104
105 if has_relative_protocol {
106 url_string = format!("{default_proto}{url_string}");
107 } else if !url_string.contains("://") {
108 url_string = format!("{default_proto}//{url_string}");
109 }
110 }
111
112 let mut parsed = url_parser::ParsedUrl::parse(&url_string)
113 .map_err(|_| NormalizeUrlError::InvalidUrl(url_string.clone()))?;
114
115 if parsed.host.is_empty() && (parsed.scheme == "http" || parsed.scheme == "https") {
117 return Err(NormalizeUrlError::InvalidUrl(url_string));
118 }
119
120 if options.force_http && parsed.scheme.eq_ignore_ascii_case("https") {
122 parsed.scheme = "http".to_string();
123 }
124 if options.force_https && parsed.scheme.eq_ignore_ascii_case("http") {
125 parsed.scheme = "https".to_string();
126 }
127
128 if options.strip_authentication {
130 parsed.username = String::new();
131 parsed.password = String::new();
132 }
133
134 if options.strip_hash {
136 parsed.fragment = None;
137 } else if options.strip_text_fragment {
138 if let Some(ref mut frag) = parsed.fragment {
139 if let Some(idx) = frag.find(":~:text") {
140 if idx == 0 {
141 *frag = String::new();
142 } else {
143 frag.truncate(idx);
144 }
145 }
146 }
147 }
148
149 if let Some(ref frag) = parsed.fragment {
151 if frag.is_empty() {
152 parsed.fragment = None;
153 }
154 }
155
156 if !parsed.path.is_empty() {
158 parsed.path = path::remove_duplicate_slashes(&parsed.path);
159 }
160
161 if !parsed.path.is_empty() {
163 parsed.path = path::decode_pathname(&parsed.path);
164 }
165
166 path::remove_directory_index(&mut parsed.path, &options.remove_directory_index);
168
169 if options.remove_path {
171 parsed.path = "/".to_string();
172 }
173
174 if let Some(ref transform) = options.transform_path {
176 let components: Vec<String> = parsed
177 .path
178 .split('/')
179 .filter(|s| !s.is_empty())
180 .map(|s| s.to_string())
181 .collect();
182 let new_components = transform(components);
183 if new_components.is_empty() {
184 parsed.path = "/".to_string();
185 } else {
186 parsed.path = format!("/{}", new_components.join("/"));
187 }
188 }
189
190 if !parsed.host.is_empty() {
192 if parsed.host.ends_with('.') {
194 parsed.host.pop();
195 }
196
197 if options.strip_www {
199 let host = parsed.host.to_lowercase();
200 if host.starts_with("www.")
201 && !host[4..].starts_with("www.")
202 && is_valid_www_strip(&host)
203 {
204 parsed.host = parsed.host[4..].to_string();
205 }
206 }
207 }
208
209 match parsed.port {
211 Some(80) if parsed.scheme.eq_ignore_ascii_case("http") => parsed.port = None,
212 Some(443) if parsed.scheme.eq_ignore_ascii_case("https") => parsed.port = None,
213 _ => {}
214 }
215
216 let original_query = parsed.query.clone();
218 query::process_query(&mut parsed, options, original_query.as_deref());
219
220 if options.remove_trailing_slash && parsed.path.ends_with('/') && parsed.path.len() > 1 {
222 parsed.path.pop();
223 }
224
225 if options.remove_explicit_port {
227 parsed.port = None;
228 }
229
230 let old_url_string = url_string.clone();
232 url_string = parsed.to_string();
233
234 if !options.remove_single_slash
236 && parsed.path == "/"
237 && !old_url_string.ends_with('/')
238 && parsed.fragment.is_none()
239 && url_string.ends_with('/')
240 {
241 url_string.pop();
242 }
243
244 if (options.remove_trailing_slash || parsed.path == "/")
246 && parsed.fragment.is_none()
247 && options.remove_single_slash
248 && url_string.ends_with('/')
249 {
250 url_string.pop();
251 }
252
253 if has_relative_protocol && !options.normalize_protocol {
255 if let Some(rest) = url_string.strip_prefix("http://") {
256 url_string = format!("//{rest}");
257 }
258 }
259
260 if options.strip_protocol {
262 if let Some(rest) = url_string.strip_prefix("https://") {
263 url_string = rest.to_string();
264 } else if let Some(rest) = url_string.strip_prefix("http://") {
265 url_string = rest.to_string();
266 } else if let Some(rest) = url_string.strip_prefix("//") {
267 url_string = rest.to_string();
268 }
269 }
270
271 Ok(url_string)
272}
273
/// Returns the lowercase scheme of `url_string` (with a trailing `:`) when it counts as a
/// custom protocol, and `None` otherwise.
///
/// The text before the first `:` is a scheme candidate when it starts with an ASCII letter
/// and contains only ASCII letters, digits, `+`, `-` or `.`. The schemes `http`, `https`,
/// `file` and `data` are never custom. A candidate containing a `.` is accepted only when
/// `//` follows the colon, so that `example.com:8080` is not mistaken for a scheme.
fn detect_custom_protocol(url_string: &str) -> Option<String> {
    let (scheme, after_colon) = url_string.split_once(':')?;

    // First byte must be a letter; the remaining bytes may also be digits or `+ - .`.
    let mut scheme_bytes = scheme.bytes();
    let starts_with_letter = match scheme_bytes.next() {
        Some(first) => first.is_ascii_alphabetic(),
        None => false,
    };
    if !starts_with_letter {
        return None;
    }
    let rest_is_valid =
        scheme_bytes.all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'));
    if !rest_is_valid {
        return None;
    }

    // The candidate is pure ASCII at this point, so ASCII lowercasing is exact.
    let lower_scheme = scheme.to_ascii_lowercase();
    if matches!(lower_scheme.as_str(), "http" | "https" | "file" | "data") {
        return None;
    }

    let has_authority = after_colon.starts_with("//");
    if lower_scheme.contains('.') && !has_authority {
        return None;
    }

    Some(format!("{lower_scheme}:"))
}
301
/// Reports whether the host that remains after dropping the first four bytes of `host`
/// (the `www.` prefix, which the caller is expected to have verified) is an acceptable
/// result for `www.` stripping.
///
/// The remainder must have the shape `label.rest` where:
/// * `label` (up to the first `.`) is 1–63 bytes of ASCII lowercase letters, digits or `-`;
/// * `rest` (everything after that `.`) is 2–63 bytes of ASCII lowercase letters, digits,
///   `-` or `.`.
///
/// Uppercase input is rejected, so callers should pass an already-lowercased host.
/// Returns `false` instead of panicking when `host` is shorter than four bytes or byte 4
/// does not fall on a character boundary.
fn is_valid_www_strip(host: &str) -> bool {
    // Checked slicing: a short or oddly encoded host is simply "not strippable".
    let without_www = match host.get(4..) {
        Some(remainder) => remainder,
        None => return false,
    };

    let (label, rest) = match without_www.split_once('.') {
        Some(parts) => parts,
        None => return false,
    };

    let label_ok = (1..=63).contains(&label.len())
        && label
            .bytes()
            .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'-');

    let rest_ok = (2..=63).contains(&rest.len())
        && rest
            .bytes()
            .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'-' || b == b'.');

    label_ok && rest_ok
}