normalize_url_rs/
lib.rs

1/*!
2
3normalize-url-rs is a port of Node.js [normalize-url](https://github.com/sindresorhus/normalize-url) package
4for the [Rust](http://rust-lang.org/) programming language.
5
6# Sample usage
7```
8use normalize_url_rs::{normalize_url, OptionsBuilder};
9
10let options = OptionsBuilder::default().build().unwrap();
11let result = normalize_url("https://www.rust-lang.org/", &options);
12
13assert_eq!(result.unwrap(), "https://rust-lang.org")
14```
15
16# Known differences vs original Node.js library
17
18- Custom protocols are not supported
19- Data URLs are not supported
20*/
21
22use derive_builder::Builder;
23use fancy_regex::Regex;
24use lazy_static::lazy_static;
25use std::iter::Peekable;
26use thiserror::Error;
27use url::Url;
28use urlencoding::decode;
29
/// Iterator adaptor that yields every item of the wrapped iterator except the final one.
///
/// It works by looking one item ahead: an item is only handed out once a
/// successor is known to exist, so the last item is silently dropped.
struct SkipLastIterator<I: Iterator>(Peekable<I>);

impl<I: Iterator> Iterator for SkipLastIterator<I> {
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        // Stop immediately when the source reports exhaustion. Unwrapping the
        // item after peeking would panic for sources that are not fused, i.e.
        // that can produce items again after having returned `None`.
        let item = self.0.next()?;

        // `item` is the final element exactly when nothing follows it.
        if self.0.peek().is_some() {
            Some(item)
        } else {
            None
        }
    }
}
41trait SkipLast: Iterator + Sized {
42    fn skip_last(self) -> SkipLastIterator<Self> {
43        SkipLastIterator(self.peekable())
44    }
45}
46impl<I: Iterator> SkipLast for I {}
47
48#[derive(Debug, Clone)]
49/// Controls whether query parameters will be removed.
50pub enum RemoveQueryParametersOptions {
51    /// No query parameters will be removed.
52    None,
53    /// All query parameters will be removed.
54    All,
55    /// Only query parameters matching provided regular expressions will be removed.
56    List(Vec<Regex>),
57}
58
59#[derive(Debug, Clone)]
60/// Controls whether directory index will be removed.
61pub enum RemoveDirectoryIndexOptions {
62    /// No directory indices will be removed.
63    None,
64    /// Default regex `^index\.[a-z]+$` wil be used.
65    Default,
66    /// Only directory indices matching provided regular expressions will be removed.
67    List(Vec<Regex>),
68}
69
70#[derive(Builder, Debug, Clone)]
71#[builder(setter(into))]
72/// Normalization options.
73pub struct Options {
74    #[builder(default = "\"http\".to_string()")]
75    /// Default protocol.
76    ///
77    /// Default value: `http`.
78    pub default_protocol: String,
79    /// Prepend `defaultProtocol` to the URL if it's protocol-relative.
80    ///
81    /// Default value: `true`.
82    #[builder(default = "true")]
83    pub normalize_protocol: bool,
84    /// Normalize HTTPS to HTTP.
85    ///
86    /// Default value: `false`.
87    #[builder(default = "false")]
88    pub force_http: bool,
89    /// Normalize HTTP to HTTPS.
90    ///
91    /// This option cannot be used with the `force_http` option at the same time.
92    ///
93    /// Default value: `false`.
94    #[builder(default = "false")]
95    pub force_https: bool,
96    /// Strip the authentication part of the URL.
97    ///
98    /// Default value: `true`.
99    #[builder(default = "true")]
100    pub strip_authentication: bool,
101    /// Strip the hash part of the URL.
102    ///
103    /// Default value: `false`.
104    #[builder(default = "false")]
105    pub strip_hash: bool,
106    /// Remove the protocol from the URL: `http://sindresorhus.com` → `sindresorhus.com`.
107    ///
108    /// It will only remove `https://` and `http://` protocols.
109    ///
110    /// Default value: `false`.
111    #[builder(default = "false")]
112    pub strip_protocol: bool,
113    /// Strip the text fragment part of the URL.
114    ///
115    /// **Note**: The text fragment will always be removed if the `strip_hash` option is set to true, as the hash contains the text fragment.
116    ///
117    /// Default value: `true`.
118    #[builder(default = "true")]
119    pub strip_text_fragment: bool,
120    /// Remove www. from the URL.
121    ///
122    /// Default value: `true`.
123    #[builder(default = "true")]
124    pub strip_www: bool,
125    /// Remove query parameters that matches any of the provided strings or regexes.
126    ///
127    /// Default value: `^utm_\w+`.
128    #[builder(
129        default = "RemoveQueryParametersOptions::List(vec![Regex::new(r\"^utm_\\w+\").unwrap()])"
130    )]
131    pub remove_query_parameters: RemoveQueryParametersOptions,
132    /// Keeps only query parameters that matches any of the provided strings or regexes.
133    ///
134    /// **Note**: It overrides the `remove_query_parameters` option.
135    ///
136    /// Default value: `None`.
137    #[builder(default = "None")]
138    pub keep_query_parameters: Option<Vec<Regex>>,
139    /// Remove trailing slash.
140    ///
141    /// **Note**: Trailing slash is always removed if the URL doesn't have a pathname unless the `remove_single_slash` option is set to false.
142    ///
143    /// Default value: `true`.
144    #[builder(default = "true")]
145    pub remove_trailing_slash: bool,
146    /// Remove a sole `/` pathname in the output. This option is independent of `remove_trailing_slash`.
147    ///
148    /// Default value: `true`.
149    #[builder(default = "true")]
150    pub remove_single_slash: bool,
151    /// Removes the default directory index file from path that matches any of the provided strings or regexes. When `true`, the regex `^index\.[a-z]+$` is used.
152    ///
153    /// Default value: `None`.
154    #[builder(default = "RemoveDirectoryIndexOptions::None")]
155    pub remove_directory_index: RemoveDirectoryIndexOptions,
156    /// Removes an explicit port number from the URL.
157    ///
158    /// Port 443 is always removed from HTTPS URLs and 80 is always removed from HTTP URLs regardless of this option.
159    ///
160    /// Default value: `false`.
161    #[builder(default = "false")]
162    pub remove_explicit_port: bool,
163    /// Sorts the query parameters alphabetically by key.
164    ///
165    /// Default value: `true`.
166    #[builder(default = "true")]
167    pub sort_query_parameters: bool,
168}
169
170#[derive(Error, Debug)]
171/// Errors that can occur during normalization.
172pub enum NormalizeUrlError {
173    #[error("The `forceHttp` and `forceHttps` options cannot be used together")]
174    ForceHttpAndHttpAreExclusive,
175    #[error("Unexpected error returned by `Url` library")]
176    URLError,
177    #[error("Unexpected error")]
178    UnexpectedError(#[from] anyhow::Error),
179}
180
181pub fn normalize_url(url: &str, options: &Options) -> Result<String, NormalizeUrlError> {
182    if options.force_http && options.force_https {
183        return Err(NormalizeUrlError::ForceHttpAndHttpAreExclusive);
184    }
185
186    let mut url_string = url.trim().to_owned();
187
188    // Data URL
189    //if (/^data:/i.test(urlString)) {
190    //	return normalizeDataURL(urlString, options);
191    //}
192    //
193    //if (hasCustomProtocol(urlString)) {
194    //	return urlString;
195    //}
196    //
197
198    let has_relative_protocol = url_string.starts_with("//");
199    let is_relative_url = !has_relative_protocol && {
200        lazy_static! {
201            static ref RE: Regex = Regex::new(r"^\.*\/").unwrap();
202        }
203        RE.is_match(&url_string)
204            .map_err(Into::into)
205            .map_err(NormalizeUrlError::UnexpectedError)?
206    };
207
208    // Prepend protocol
209    if !is_relative_url {
210        lazy_static! {
211            static ref RE: Regex = Regex::new(r"^(?!(?:\w+:)?\/\/)|^\/\/").unwrap();
212        }
213        url_string = RE
214            .replace(&url_string, format!("{}://", options.default_protocol))
215            .to_string();
216    }
217
218    let mut url_obj = Url::parse(&url_string)
219        .map_err(Into::into)
220        .map_err(NormalizeUrlError::UnexpectedError)?;
221
222    if options.force_http && url_obj.scheme() == "https" {
223        url_obj
224            .set_scheme("http")
225            .map_err(|()| NormalizeUrlError::URLError)?;
226    }
227
228    if options.force_https && url_obj.scheme() == "http" {
229        url_obj
230            .set_scheme("https")
231            .map_err(|()| NormalizeUrlError::URLError)?;
232    }
233
234    // Remove auth
235    if options.strip_authentication {
236        url_obj
237            .set_username("")
238            .map_err(|()| NormalizeUrlError::URLError)?;
239        url_obj
240            .set_password(None)
241            .map_err(|()| NormalizeUrlError::URLError)?;
242    }
243
244    // Remove hash
245    if options.strip_hash {
246        url_obj.set_fragment(None);
247    } else if options.strip_text_fragment && url_obj.fragment().is_some() {
248        lazy_static! {
249            static ref RE: Regex = Regex::new(r"#?:~:text.*?$").unwrap();
250        }
251        let new_fragment = RE.replace(url_obj.fragment().unwrap(), "").to_string();
252        url_obj.set_fragment(match new_fragment.is_empty() {
253            true => None,
254            false => Some(&new_fragment),
255        });
256    }
257
258    // Remove duplicate slashes if not preceded by a protocol
259    if url_obj.path().len() > 0 {
260        // Split the string by occurrences of this protocol regex, and perform
261        // duplicate-slash replacement on the strings between those occurrences
262        // (if any).
263        lazy_static! {
264            static ref RE: Regex = Regex::new(r"\b[a-z][a-z\d+\-.]{1,50}:\/\/").unwrap();
265            static ref RE2: Regex = Regex::new(r"\/{2,}").unwrap();
266        }
267
268        let mut last_index = 0;
269        let mut result = "".to_string();
270        for re_match in RE.captures_iter(url_obj.path()) {
271            let re_match = re_match
272                .map_err(Into::into)
273                .map_err(NormalizeUrlError::UnexpectedError)?;
274
275            let protocol = re_match.get(0).unwrap();
276            let protocol_at_index = protocol.start();
277            let intermediate = &url_obj.path()[last_index..protocol_at_index];
278
279            result += &RE2.replace_all(intermediate, "/");
280            result += protocol.as_str();
281            last_index = protocol_at_index + protocol.as_str().len();
282        }
283
284        let remnant = &url_obj.path()[last_index..];
285        result += &RE2.replace_all(remnant, "/");
286
287        url_obj.set_path(&result);
288    }
289
290    // Decode URI octets
291    if !url_obj.path().is_empty() {
292        let decoded_path =
293            decode(url_obj.path()).unwrap_or(std::borrow::Cow::Borrowed(url_obj.path()));
294        url_obj.set_path(&decoded_path.to_string());
295    }
296
297    // Remove directory index
298    let remove_directory_index_regexs = match &options.remove_directory_index {
299        RemoveDirectoryIndexOptions::None => vec![],
300        RemoveDirectoryIndexOptions::Default => vec![Regex::new(r"^index\.[a-z]+$").unwrap()],
301        RemoveDirectoryIndexOptions::List(regexs) => regexs.to_vec(),
302    };
303
304    if remove_directory_index_regexs.len() > 0 && url_obj.path_segments().is_some() {
305        let mut matched = false;
306        let path_segments = url_obj
307            .path_segments()
308            .unwrap()
309            .map(ToOwned::to_owned)
310            .collect::<Vec<_>>();
311
312        if let Some(last_path) = path_segments.last() {
313            for regex in &remove_directory_index_regexs {
314                if regex
315                    .is_match(last_path)
316                    .map_err(Into::into)
317                    .map_err(NormalizeUrlError::UnexpectedError)?
318                {
319                    matched = true;
320                    break;
321                }
322            }
323        }
324
325        let it = match matched {
326            true => path_segments.iter().skip_last().collect::<Vec<_>>(),
327            false => path_segments.iter().collect(),
328        };
329
330        url_obj
331            .path_segments_mut()
332            .map_err(Into::into)
333            .map_err(|()| NormalizeUrlError::URLError)?
334            .clear()
335            .extend(&it);
336
337        if matched {
338            url_obj.set_path(&format!("{}/", url_obj.path()));
339        }
340    }
341
342    if url_obj.host_str().is_some() {
343        lazy_static! {
344            static ref RE: Regex = Regex::new(r"\.$").unwrap();
345        }
346        // Remove trailing dot
347        url_obj
348            .set_host(Some(
349                &RE.replace(url_obj.host_str().unwrap(), "").to_string(),
350            ))
351            .map_err(Into::into)
352            .map_err(NormalizeUrlError::UnexpectedError)?;
353
354        // Remove `www.`
355        if options.strip_www {
356            lazy_static! {
357                static ref RE: Regex =
358                    Regex::new(r"^www\.(?!www\.)[a-z\-\d]{1,63}\.[a-z.\-\d]{2,63}$").unwrap();
359                static ref RE2: Regex = Regex::new(r"^www\.").unwrap();
360            }
361            // Each label should be max 63 at length (min: 1).
362            // Source: https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_host_names
363            // Each TLD should be up to 63 characters long (min: 2).
364            // It is technically possible to have a single character TLD, but none currently exist.
365            let host_str = url_obj.host_str().unwrap().to_string();
366            if RE
367                .is_match(&host_str)
368                .map_err(Into::into)
369                .map_err(NormalizeUrlError::UnexpectedError)?
370            {
371                url_obj
372                    .set_host(Some(&RE2.replace(&host_str, "")))
373                    .map_err(Into::into)
374                    .map_err(NormalizeUrlError::UnexpectedError)?;
375            }
376        }
377    }
378
379    // Remove query unwanted parameters
380    if let RemoveQueryParametersOptions::List(ref regexs) = options.remove_query_parameters {
381        let url_copy = url_obj.clone();
382        let mut query_pairs = url_obj.query_pairs_mut();
383        query_pairs.clear();
384
385        for (key, value) in url_copy.query_pairs() {
386            let mut matched = false;
387            for regex in regexs {
388                if regex
389                    .is_match(&key)
390                    .map_err(Into::into)
391                    .map_err(NormalizeUrlError::UnexpectedError)?
392                {
393                    matched = true;
394                    break;
395                }
396            }
397
398            if !matched {
399                query_pairs.append_pair(&key, &value);
400            }
401        }
402
403        query_pairs.finish();
404    }
405
406    if options.keep_query_parameters.is_none() {
407        if let RemoveQueryParametersOptions::All = &options.remove_query_parameters {
408            url_obj.set_query(None);
409        }
410    }
411
412    // Keep wanted query parameters
413    if options.keep_query_parameters.is_some() {
414        let url_copy = url_obj.clone();
415        let mut query_pairs = url_obj.query_pairs_mut();
416        query_pairs.clear();
417        for (key, value) in url_copy.query_pairs() {
418            for regex in options.keep_query_parameters.as_ref().unwrap() {
419                if regex
420                    .is_match(&key)
421                    .map_err(Into::into)
422                    .map_err(NormalizeUrlError::UnexpectedError)?
423                {
424                    query_pairs.append_pair(&key, &value);
425                    break;
426                }
427            }
428        }
429        query_pairs.finish();
430    }
431
432    if let Some(query_str) = url_obj.query() {
433        if query_str.is_empty() {
434            url_obj.set_query(None);
435        }
436    }
437
438    // Sort query parameters
439    if options.sort_query_parameters && url_obj.query_pairs().count() > 0 {
440        {
441            let url_copy = url_obj.clone();
442            let mut query_pairs = url_obj.query_pairs_mut();
443            query_pairs.clear();
444            let mut pairs = url_copy.query_pairs().collect::<Vec<_>>();
445            pairs.sort_by(|a, b| a.0.cmp(&b.0));
446            query_pairs.extend_pairs(pairs).finish();
447        }
448
449        if let Some(query) = url_obj.query() {
450            let decoded_query = decode(query).unwrap_or(std::borrow::Cow::Borrowed(query));
451            url_obj.set_query(Some(&decoded_query.to_string()));
452        }
453    }
454
455    if options.remove_trailing_slash {
456        lazy_static! {
457            static ref RE: Regex = Regex::new(r"\/$").unwrap();
458        }
459        url_obj.set_path(&RE.replace(url_obj.path(), "").to_string());
460    }
461
462    // Remove an explicit port number, excluding a default port number, if applicable
463    if options.remove_explicit_port && url_obj.port().is_some() {
464        url_obj
465            .set_port(None)
466            .map_err(|()| NormalizeUrlError::URLError)?;
467    }
468
469    let old_url_string = url_string;
470
471    url_string = url_obj.to_string();
472
473    let is_option_empty = |x: Option<&str>| -> bool {
474        match x {
475            Some("") => true,
476            None => true,
477            _ => false,
478        }
479    };
480
481    if !options.remove_single_slash
482        && url_obj.path() == "/"
483        && !old_url_string.ends_with('/')
484        && is_option_empty(url_obj.fragment())
485    {
486        lazy_static! {
487            static ref RE: Regex = Regex::new(r"\/$").unwrap();
488        }
489        url_string = RE.replace(&url_string, "").to_string();
490    }
491
492    // Remove ending `/` unless removeSingleSlash is false
493    if (options.remove_trailing_slash || url_obj.path() == "/")
494        && is_option_empty(url_obj.fragment())
495        && options.remove_single_slash
496    {
497        lazy_static! {
498            static ref RE: Regex = Regex::new(r"\/$").unwrap();
499        }
500        url_string = RE.replace(&url_string, "").to_string();
501    }
502
503    // Restore relative protocol, if applicable
504    if has_relative_protocol && !options.normalize_protocol {
505        lazy_static! {
506            static ref RE: Regex = Regex::new(r"^http:\/\/").unwrap();
507        }
508        url_string = RE.replace(&url_string, "//").to_string();
509    }
510
511    // Remove http/https
512    if options.strip_protocol {
513        lazy_static! {
514            static ref RE: Regex = Regex::new(r"^(?:https?:)?\/\/").unwrap();
515        }
516        url_string = RE.replace(&url_string, "").to_string();
517    }
518
519    Ok(url_string)
520}