1use derive_builder::Builder;
23use fancy_regex::Regex;
24use lazy_static::lazy_static;
25use std::iter::Peekable;
26use thiserror::Error;
27use url::Url;
28use urlencoding::decode;
29
/// Iterator adaptor that yields every item of the wrapped iterator except the
/// final one.
///
/// It works by looking one element ahead: an item is only handed out when the
/// underlying [`Peekable`] reports that another item follows it.
struct SkipLastIterator<I: Iterator>(Peekable<I>);

impl<I: Iterator> Iterator for SkipLastIterator<I> {
    type Item = I::Item;

    /// Returns the next item unless it is the last one of the inner iterator.
    fn next(&mut self) -> Option<Self::Item> {
        // Bail out as soon as the inner iterator reports exhaustion. The
        // previous implementation peeked first and then unwrapped the item,
        // which panicked for non-fused iterators that yield `None` and later
        // resume producing values.
        let item = self.0.next()?;
        if self.0.peek().is_some() {
            Some(item)
        } else {
            // `item` was the final element: swallow it.
            None
        }
    }
}
41trait SkipLast: Iterator + Sized {
42 fn skip_last(self) -> SkipLastIterator<Self> {
43 SkipLastIterator(self.peekable())
44 }
45}
46impl<I: Iterator> SkipLast for I {}
47
/// Selects which query parameters `normalize_url` removes from the URL.
#[derive(Debug, Clone)]
pub enum RemoveQueryParametersOptions {
    /// No query parameter is removed by this option.
    None,
    /// The whole query string is removed. This is only applied when
    /// `Options::keep_query_parameters` is `None`.
    All,
    /// Every query parameter whose key matches at least one of the regexes is
    /// removed.
    List(Vec<Regex>),
}
58
/// Selects whether and how `normalize_url` removes a directory index file
/// (the last path segment) from the URL.
#[derive(Debug, Clone)]
pub enum RemoveDirectoryIndexOptions {
    /// The path is left untouched.
    None,
    /// The last path segment is removed when it matches `^index\.[a-z]+$`.
    Default,
    /// The last path segment is removed when it matches at least one of the
    /// regexes.
    List(Vec<Regex>),
}
69
/// Configuration for `normalize_url`.
///
/// The `Builder` derive generates a builder whose setters accept anything
/// convertible into the field type (`setter(into)`); each field falls back to
/// the default given in its `#[builder(default = ...)]` attribute.
#[derive(Builder, Debug, Clone)]
#[builder(setter(into))]
pub struct Options {
    /// Scheme (without `://`) prepended to inputs that have no scheme or that
    /// start with `//`. Default: `"http"`.
    #[builder(default = "\"http\".to_string()")]
    pub default_protocol: String,
    /// When `false`, an input that started with `//` keeps its
    /// protocol-relative form: a leading `http://` in the result is replaced
    /// with `//`. Default: `true`.
    #[builder(default = "true")]
    pub normalize_protocol: bool,
    /// Rewrite `https` URLs to `http`. Cannot be combined with `force_https`.
    /// Default: `false`.
    #[builder(default = "false")]
    pub force_http: bool,
    /// Rewrite `http` URLs to `https`. Cannot be combined with `force_http`.
    /// Default: `false`.
    #[builder(default = "false")]
    pub force_https: bool,
    /// Remove the username and password from the URL. Default: `true`.
    #[builder(default = "true")]
    pub strip_authentication: bool,
    /// Remove the fragment (`#...`) from the URL. Default: `false`.
    #[builder(default = "false")]
    pub strip_hash: bool,
    /// Remove a leading `http://`, `https://` or `//` from the final string.
    /// Default: `false`.
    #[builder(default = "false")]
    pub strip_protocol: bool,
    /// Remove the `:~:text...` text-fragment directive from the fragment.
    /// Only consulted when `strip_hash` is `false`. Default: `true`.
    #[builder(default = "true")]
    pub strip_text_fragment: bool,
    /// Remove a leading `www.` from the host. Only hosts of the shape
    /// `www.<label>.<rest>` are changed; a host starting with `www.www.` is
    /// left alone. Default: `true`.
    #[builder(default = "true")]
    pub strip_www: bool,
    /// Which query parameters to remove. Default: a list with the single
    /// regex `^utm_\w+`.
    #[builder(
        default = "RemoveQueryParametersOptions::List(vec![Regex::new(r\"^utm_\\w+\").unwrap()])"
    )]
    pub remove_query_parameters: RemoveQueryParametersOptions,
    /// When `Some`, only query parameters whose key matches at least one of
    /// the regexes are kept. This filter runs after the
    /// `remove_query_parameters` list, and
    /// `RemoveQueryParametersOptions::All` is not applied while it is set.
    /// Default: `None`.
    #[builder(default = "None")]
    pub keep_query_parameters: Option<Vec<Regex>>,
    /// Remove a trailing `/` from the path. Default: `true`.
    #[builder(default = "true")]
    pub remove_trailing_slash: bool,
    /// When `true`, a trailing `/` is stripped from the final string if the
    /// path is `/` (or `remove_trailing_slash` is set) and the URL has no
    /// fragment. When `false`, the slash of a bare `/` path is only dropped
    /// if the URL string did not end with `/` before parsing.
    /// Default: `true`.
    #[builder(default = "true")]
    pub remove_single_slash: bool,
    /// Whether to remove a directory index file from the end of the path.
    /// Default: `RemoveDirectoryIndexOptions::None`.
    #[builder(default = "RemoveDirectoryIndexOptions::None")]
    pub remove_directory_index: RemoveDirectoryIndexOptions,
    /// Remove the port from the URL when one is present. Default: `false`.
    #[builder(default = "false")]
    pub remove_explicit_port: bool,
    /// Sort query parameters by key, then percent-decode the query string.
    /// Default: `true`.
    #[builder(default = "true")]
    pub sort_query_parameters: bool,
}
169
170#[derive(Error, Debug)]
171pub enum NormalizeUrlError {
173 #[error("The `forceHttp` and `forceHttps` options cannot be used together")]
174 ForceHttpAndHttpAreExclusive,
175 #[error("Unexpected error returned by `Url` library")]
176 URLError,
177 #[error("Unexpected error")]
178 UnexpectedError(#[from] anyhow::Error),
179}
180
181pub fn normalize_url(url: &str, options: &Options) -> Result<String, NormalizeUrlError> {
182 if options.force_http && options.force_https {
183 return Err(NormalizeUrlError::ForceHttpAndHttpAreExclusive);
184 }
185
186 let mut url_string = url.trim().to_owned();
187
188 let has_relative_protocol = url_string.starts_with("//");
199 let is_relative_url = !has_relative_protocol && {
200 lazy_static! {
201 static ref RE: Regex = Regex::new(r"^\.*\/").unwrap();
202 }
203 RE.is_match(&url_string)
204 .map_err(Into::into)
205 .map_err(NormalizeUrlError::UnexpectedError)?
206 };
207
208 if !is_relative_url {
210 lazy_static! {
211 static ref RE: Regex = Regex::new(r"^(?!(?:\w+:)?\/\/)|^\/\/").unwrap();
212 }
213 url_string = RE
214 .replace(&url_string, format!("{}://", options.default_protocol))
215 .to_string();
216 }
217
218 let mut url_obj = Url::parse(&url_string)
219 .map_err(Into::into)
220 .map_err(NormalizeUrlError::UnexpectedError)?;
221
222 if options.force_http && url_obj.scheme() == "https" {
223 url_obj
224 .set_scheme("http")
225 .map_err(|()| NormalizeUrlError::URLError)?;
226 }
227
228 if options.force_https && url_obj.scheme() == "http" {
229 url_obj
230 .set_scheme("https")
231 .map_err(|()| NormalizeUrlError::URLError)?;
232 }
233
234 if options.strip_authentication {
236 url_obj
237 .set_username("")
238 .map_err(|()| NormalizeUrlError::URLError)?;
239 url_obj
240 .set_password(None)
241 .map_err(|()| NormalizeUrlError::URLError)?;
242 }
243
244 if options.strip_hash {
246 url_obj.set_fragment(None);
247 } else if options.strip_text_fragment && url_obj.fragment().is_some() {
248 lazy_static! {
249 static ref RE: Regex = Regex::new(r"#?:~:text.*?$").unwrap();
250 }
251 let new_fragment = RE.replace(url_obj.fragment().unwrap(), "").to_string();
252 url_obj.set_fragment(match new_fragment.is_empty() {
253 true => None,
254 false => Some(&new_fragment),
255 });
256 }
257
258 if url_obj.path().len() > 0 {
260 lazy_static! {
264 static ref RE: Regex = Regex::new(r"\b[a-z][a-z\d+\-.]{1,50}:\/\/").unwrap();
265 static ref RE2: Regex = Regex::new(r"\/{2,}").unwrap();
266 }
267
268 let mut last_index = 0;
269 let mut result = "".to_string();
270 for re_match in RE.captures_iter(url_obj.path()) {
271 let re_match = re_match
272 .map_err(Into::into)
273 .map_err(NormalizeUrlError::UnexpectedError)?;
274
275 let protocol = re_match.get(0).unwrap();
276 let protocol_at_index = protocol.start();
277 let intermediate = &url_obj.path()[last_index..protocol_at_index];
278
279 result += &RE2.replace_all(intermediate, "/");
280 result += protocol.as_str();
281 last_index = protocol_at_index + protocol.as_str().len();
282 }
283
284 let remnant = &url_obj.path()[last_index..];
285 result += &RE2.replace_all(remnant, "/");
286
287 url_obj.set_path(&result);
288 }
289
290 if !url_obj.path().is_empty() {
292 let decoded_path =
293 decode(url_obj.path()).unwrap_or(std::borrow::Cow::Borrowed(url_obj.path()));
294 url_obj.set_path(&decoded_path.to_string());
295 }
296
297 let remove_directory_index_regexs = match &options.remove_directory_index {
299 RemoveDirectoryIndexOptions::None => vec![],
300 RemoveDirectoryIndexOptions::Default => vec![Regex::new(r"^index\.[a-z]+$").unwrap()],
301 RemoveDirectoryIndexOptions::List(regexs) => regexs.to_vec(),
302 };
303
304 if remove_directory_index_regexs.len() > 0 && url_obj.path_segments().is_some() {
305 let mut matched = false;
306 let path_segments = url_obj
307 .path_segments()
308 .unwrap()
309 .map(ToOwned::to_owned)
310 .collect::<Vec<_>>();
311
312 if let Some(last_path) = path_segments.last() {
313 for regex in &remove_directory_index_regexs {
314 if regex
315 .is_match(last_path)
316 .map_err(Into::into)
317 .map_err(NormalizeUrlError::UnexpectedError)?
318 {
319 matched = true;
320 break;
321 }
322 }
323 }
324
325 let it = match matched {
326 true => path_segments.iter().skip_last().collect::<Vec<_>>(),
327 false => path_segments.iter().collect(),
328 };
329
330 url_obj
331 .path_segments_mut()
332 .map_err(Into::into)
333 .map_err(|()| NormalizeUrlError::URLError)?
334 .clear()
335 .extend(&it);
336
337 if matched {
338 url_obj.set_path(&format!("{}/", url_obj.path()));
339 }
340 }
341
342 if url_obj.host_str().is_some() {
343 lazy_static! {
344 static ref RE: Regex = Regex::new(r"\.$").unwrap();
345 }
346 url_obj
348 .set_host(Some(
349 &RE.replace(url_obj.host_str().unwrap(), "").to_string(),
350 ))
351 .map_err(Into::into)
352 .map_err(NormalizeUrlError::UnexpectedError)?;
353
354 if options.strip_www {
356 lazy_static! {
357 static ref RE: Regex =
358 Regex::new(r"^www\.(?!www\.)[a-z\-\d]{1,63}\.[a-z.\-\d]{2,63}$").unwrap();
359 static ref RE2: Regex = Regex::new(r"^www\.").unwrap();
360 }
361 let host_str = url_obj.host_str().unwrap().to_string();
366 if RE
367 .is_match(&host_str)
368 .map_err(Into::into)
369 .map_err(NormalizeUrlError::UnexpectedError)?
370 {
371 url_obj
372 .set_host(Some(&RE2.replace(&host_str, "")))
373 .map_err(Into::into)
374 .map_err(NormalizeUrlError::UnexpectedError)?;
375 }
376 }
377 }
378
379 if let RemoveQueryParametersOptions::List(ref regexs) = options.remove_query_parameters {
381 let url_copy = url_obj.clone();
382 let mut query_pairs = url_obj.query_pairs_mut();
383 query_pairs.clear();
384
385 for (key, value) in url_copy.query_pairs() {
386 let mut matched = false;
387 for regex in regexs {
388 if regex
389 .is_match(&key)
390 .map_err(Into::into)
391 .map_err(NormalizeUrlError::UnexpectedError)?
392 {
393 matched = true;
394 break;
395 }
396 }
397
398 if !matched {
399 query_pairs.append_pair(&key, &value);
400 }
401 }
402
403 query_pairs.finish();
404 }
405
406 if options.keep_query_parameters.is_none() {
407 if let RemoveQueryParametersOptions::All = &options.remove_query_parameters {
408 url_obj.set_query(None);
409 }
410 }
411
412 if options.keep_query_parameters.is_some() {
414 let url_copy = url_obj.clone();
415 let mut query_pairs = url_obj.query_pairs_mut();
416 query_pairs.clear();
417 for (key, value) in url_copy.query_pairs() {
418 for regex in options.keep_query_parameters.as_ref().unwrap() {
419 if regex
420 .is_match(&key)
421 .map_err(Into::into)
422 .map_err(NormalizeUrlError::UnexpectedError)?
423 {
424 query_pairs.append_pair(&key, &value);
425 break;
426 }
427 }
428 }
429 query_pairs.finish();
430 }
431
432 if let Some(query_str) = url_obj.query() {
433 if query_str.is_empty() {
434 url_obj.set_query(None);
435 }
436 }
437
438 if options.sort_query_parameters && url_obj.query_pairs().count() > 0 {
440 {
441 let url_copy = url_obj.clone();
442 let mut query_pairs = url_obj.query_pairs_mut();
443 query_pairs.clear();
444 let mut pairs = url_copy.query_pairs().collect::<Vec<_>>();
445 pairs.sort_by(|a, b| a.0.cmp(&b.0));
446 query_pairs.extend_pairs(pairs).finish();
447 }
448
449 if let Some(query) = url_obj.query() {
450 let decoded_query = decode(query).unwrap_or(std::borrow::Cow::Borrowed(query));
451 url_obj.set_query(Some(&decoded_query.to_string()));
452 }
453 }
454
455 if options.remove_trailing_slash {
456 lazy_static! {
457 static ref RE: Regex = Regex::new(r"\/$").unwrap();
458 }
459 url_obj.set_path(&RE.replace(url_obj.path(), "").to_string());
460 }
461
462 if options.remove_explicit_port && url_obj.port().is_some() {
464 url_obj
465 .set_port(None)
466 .map_err(|()| NormalizeUrlError::URLError)?;
467 }
468
469 let old_url_string = url_string;
470
471 url_string = url_obj.to_string();
472
473 let is_option_empty = |x: Option<&str>| -> bool {
474 match x {
475 Some("") => true,
476 None => true,
477 _ => false,
478 }
479 };
480
481 if !options.remove_single_slash
482 && url_obj.path() == "/"
483 && !old_url_string.ends_with('/')
484 && is_option_empty(url_obj.fragment())
485 {
486 lazy_static! {
487 static ref RE: Regex = Regex::new(r"\/$").unwrap();
488 }
489 url_string = RE.replace(&url_string, "").to_string();
490 }
491
492 if (options.remove_trailing_slash || url_obj.path() == "/")
494 && is_option_empty(url_obj.fragment())
495 && options.remove_single_slash
496 {
497 lazy_static! {
498 static ref RE: Regex = Regex::new(r"\/$").unwrap();
499 }
500 url_string = RE.replace(&url_string, "").to_string();
501 }
502
503 if has_relative_protocol && !options.normalize_protocol {
505 lazy_static! {
506 static ref RE: Regex = Regex::new(r"^http:\/\/").unwrap();
507 }
508 url_string = RE.replace(&url_string, "//").to_string();
509 }
510
511 if options.strip_protocol {
513 lazy_static! {
514 static ref RE: Regex = Regex::new(r"^(?:https?:)?\/\/").unwrap();
515 }
516 url_string = RE.replace(&url_string, "").to_string();
517 }
518
519 Ok(url_string)
520}