use derive_builder::Builder;
use fancy_regex::Regex;
use lazy_static::lazy_static;
use std::iter::Peekable;
use thiserror::Error;
use url::Url;
use urlencoding::decode;
/// Iterator adaptor that yields every item of the wrapped iterator except the final one.
///
/// The source is held in its [`Peekable`] form so that, after pulling an item, the adaptor
/// can look one step ahead and tell whether that item was the last.
struct SkipLastIterator<I: Iterator>(Peekable<I>);

impl<I: Iterator> Iterator for SkipLastIterator<I> {
    type Item = I::Item;

    /// Returns the next item, or `None` once only the final item (or nothing) is left.
    fn next(&mut self) -> Option<Self::Item> {
        // An exhausted source ends the iteration immediately: no look-ahead is performed
        // and there is no `unwrap` that could panic.
        let item = self.0.next()?;
        // Hand `item` out only when something follows it; otherwise it was the last
        // item and is dropped.
        self.0.peek().map(|_| item)
    }

    /// One item fewer than the wrapped iterator reports, saturating at zero.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, upper) = self.0.size_hint();
        (lower.saturating_sub(1), upper.map(|n| n.saturating_sub(1)))
    }
}
/// Extension trait that adds [`skip_last`](SkipLast::skip_last) to every sized iterator.
trait SkipLast: Iterator + Sized {
    /// Consumes the iterator and wraps its peekable form in a [`SkipLastIterator`].
    fn skip_last(self) -> SkipLastIterator<Self> {
        let peekable = Iterator::peekable(self);
        SkipLastIterator(peekable)
    }
}

// Blanket implementation: every `Iterator` picks up `skip_last` automatically.
impl<I> SkipLast for I where I: Iterator {}
/// Selects which query parameters `normalize_url` removes.
#[derive(Debug, Clone)]
pub enum RemoveQueryParametersOptions {
/// No query parameter is removed because of this option.
None,
/// The whole query string is dropped; `normalize_url` only does so when
/// `Options::keep_query_parameters` is `None`.
All,
/// Every parameter whose key matches at least one of these regexes is dropped.
List(Vec<Regex>),
}
/// Selects which trailing "directory index" path segment `normalize_url` removes.
#[derive(Debug, Clone)]
pub enum RemoveDirectoryIndexOptions {
/// No regex is used, so the directory-index step is skipped.
None,
/// Uses the single built-in pattern `^index\.[a-z]+$`.
Default,
/// Uses the given regexes; each is tested against the last path segment.
List(Vec<Regex>),
}
/// Settings that control every step of `normalize_url`.
///
/// The `Builder` derive generates a builder type for this struct; each field's
/// `#[builder(default = ...)]` attribute holds the value used when the field is not set.
#[derive(Builder, Debug, Clone)]
#[builder(setter(into))]
pub struct Options {
/// Scheme name (without `://`) that is put in front of an input that has no
/// `scheme://` prefix or that starts with `//`.
#[builder(default = "\"http\".to_string()")]
pub default_protocol: String,
/// When `false` and the input started with `//`, a leading `http://` in the result
/// is turned back into `//`.
#[builder(default = "true")]
pub normalize_protocol: bool,
/// Rewrites an `https` scheme to `http`. Must not be combined with `force_https`.
#[builder(default = "false")]
pub force_http: bool,
/// Rewrites an `http` scheme to `https`. Must not be combined with `force_http`.
#[builder(default = "false")]
pub force_https: bool,
/// Clears the username and password of the URL.
#[builder(default = "true")]
pub strip_authentication: bool,
/// Removes the fragment entirely.
#[builder(default = "false")]
pub strip_hash: bool,
/// Removes a leading `http://`, `https://` or `//` from the final string.
#[builder(default = "false")]
pub strip_protocol: bool,
/// When `strip_hash` is off, removes the `:~:text...` part (with an optional leading `#`)
/// from the fragment and drops the fragment if nothing remains.
#[builder(default = "true")]
pub strip_text_fragment: bool,
/// Removes a leading `www.` from the host when the host matches the pattern used by
/// `normalize_url` (a `www.` label not followed by another `www.`).
#[builder(default = "true")]
pub strip_www: bool,
/// Which query parameters to remove; the default removes keys matching `^utm_\w+`.
#[builder(
default = "RemoveQueryParametersOptions::List(vec![Regex::new(r\"^utm_\\w+\").unwrap()])"
)]
pub remove_query_parameters: RemoveQueryParametersOptions,
/// When `Some`, only parameters whose key matches one of these regexes are kept.
#[builder(default = "None")]
pub keep_query_parameters: Option<Vec<Regex>>,
/// Removes one trailing `/` from the path.
#[builder(default = "true")]
pub remove_trailing_slash: bool,
/// Controls how a trailing `/` of the final string is handled when the fragment is
/// absent or empty: when `true` it is removed if `remove_trailing_slash` is set or the
/// path is `/`; when `false` the slash of a lone `/` path is removed only if the
/// protocol-prefixed input did not itself end with `/`.
#[builder(default = "true")]
pub remove_single_slash: bool,
/// Which trailing directory-index file names (tested on the last path segment) to remove.
#[builder(default = "RemoveDirectoryIndexOptions::None")]
pub remove_directory_index: RemoveDirectoryIndexOptions,
/// Clears the port when `Url::port()` reports one.
#[builder(default = "false")]
pub remove_explicit_port: bool,
/// Sorts the query pairs by key, then percent-decodes the resulting query string when
/// decoding succeeds.
#[builder(default = "true")]
pub sort_query_parameters: bool,
}
/// Errors returned by `normalize_url`.
#[derive(Error, Debug)]
pub enum NormalizeUrlError {
/// Both `Options::force_http` and `Options::force_https` were set.
// NOTE(review): the message uses the camelCase names `forceHttp`/`forceHttps`, while the
// fields of `Options` are `force_http`/`force_https` — confirm whether that is intended.
#[error("The `forceHttp` and `forceHttps` options cannot be used together")]
ForceHttpAndHttpAreExclusive,
/// A `Url` setter that reports failure as `Err(())` (scheme, username, password,
/// path segments or port) rejected the change.
#[error("Unexpected error returned by `Url` library")]
URLError,
/// Any other failure, carried as an `anyhow::Error`; `normalize_url` uses it for URL
/// parse errors, host-setting errors and regex evaluation errors.
#[error("Unexpected error")]
UnexpectedError(#[from] anyhow::Error),
}
pub fn normalize_url(url: &str, options: &Options) -> Result<String, NormalizeUrlError> {
if options.force_http && options.force_https {
return Err(NormalizeUrlError::ForceHttpAndHttpAreExclusive);
}
let mut url_string = url.trim().to_owned();
let has_relative_protocol = url_string.starts_with("//");
let is_relative_url = !has_relative_protocol && {
lazy_static! {
static ref RE: Regex = Regex::new(r"^\.*\/").unwrap();
}
RE.is_match(&url_string)
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?
};
if !is_relative_url {
lazy_static! {
static ref RE: Regex = Regex::new(r"^(?!(?:\w+:)?\/\/)|^\/\/").unwrap();
}
url_string = RE
.replace(&url_string, format!("{}://", options.default_protocol))
.to_string();
}
let mut url_obj = Url::parse(&url_string)
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?;
if options.force_http && url_obj.scheme() == "https" {
url_obj
.set_scheme("http")
.map_err(|()| NormalizeUrlError::URLError)?;
}
if options.force_https && url_obj.scheme() == "http" {
url_obj
.set_scheme("https")
.map_err(|()| NormalizeUrlError::URLError)?;
}
if options.strip_authentication {
url_obj
.set_username("")
.map_err(|()| NormalizeUrlError::URLError)?;
url_obj
.set_password(None)
.map_err(|()| NormalizeUrlError::URLError)?;
}
if options.strip_hash {
url_obj.set_fragment(None);
} else if options.strip_text_fragment && url_obj.fragment().is_some() {
lazy_static! {
static ref RE: Regex = Regex::new(r"#?:~:text.*?$").unwrap();
}
let new_fragment = RE.replace(url_obj.fragment().unwrap(), "").to_string();
url_obj.set_fragment(match new_fragment.is_empty() {
true => None,
false => Some(&new_fragment),
});
}
if url_obj.path().len() > 0 {
lazy_static! {
static ref RE: Regex = Regex::new(r"\b[a-z][a-z\d+\-.]{1,50}:\/\/").unwrap();
static ref RE2: Regex = Regex::new(r"\/{2,}").unwrap();
}
let mut last_index = 0;
let mut result = "".to_string();
for re_match in RE.captures_iter(url_obj.path()) {
let re_match = re_match
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?;
let protocol = re_match.get(0).unwrap();
let protocol_at_index = protocol.start();
let intermediate = &url_obj.path()[last_index..protocol_at_index];
result += &RE2.replace_all(intermediate, "/");
result += protocol.as_str();
last_index = protocol_at_index + protocol.as_str().len();
}
let remnant = &url_obj.path()[last_index..];
result += &RE2.replace_all(remnant, "/");
url_obj.set_path(&result);
}
if !url_obj.path().is_empty() {
let decoded_path =
decode(url_obj.path()).unwrap_or(std::borrow::Cow::Borrowed(url_obj.path()));
url_obj.set_path(&decoded_path.to_string());
}
let remove_directory_index_regexs = match &options.remove_directory_index {
RemoveDirectoryIndexOptions::None => vec![],
RemoveDirectoryIndexOptions::Default => vec![Regex::new(r"^index\.[a-z]+$").unwrap()],
RemoveDirectoryIndexOptions::List(regexs) => regexs.to_vec(),
};
if remove_directory_index_regexs.len() > 0 && url_obj.path_segments().is_some() {
let mut matched = false;
let path_segments = url_obj
.path_segments()
.unwrap()
.map(ToOwned::to_owned)
.collect::<Vec<_>>();
if let Some(last_path) = path_segments.last() {
for regex in &remove_directory_index_regexs {
if regex
.is_match(last_path)
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?
{
matched = true;
break;
}
}
}
let it = match matched {
true => path_segments.iter().skip_last().collect::<Vec<_>>(),
false => path_segments.iter().collect(),
};
url_obj
.path_segments_mut()
.map_err(Into::into)
.map_err(|()| NormalizeUrlError::URLError)?
.clear()
.extend(&it);
if matched {
url_obj.set_path(&format!("{}/", url_obj.path()));
}
}
if url_obj.host_str().is_some() {
lazy_static! {
static ref RE: Regex = Regex::new(r"\.$").unwrap();
}
url_obj
.set_host(Some(
&RE.replace(url_obj.host_str().unwrap(), "").to_string(),
))
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?;
if options.strip_www {
lazy_static! {
static ref RE: Regex =
Regex::new(r"^www\.(?!www\.)[a-z\-\d]{1,63}\.[a-z.\-\d]{2,63}$").unwrap();
static ref RE2: Regex = Regex::new(r"^www\.").unwrap();
}
let host_str = url_obj.host_str().unwrap().to_string();
if RE
.is_match(&host_str)
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?
{
url_obj
.set_host(Some(&RE2.replace(&host_str, "")))
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?;
}
}
}
if let RemoveQueryParametersOptions::List(ref regexs) = options.remove_query_parameters {
let url_copy = url_obj.clone();
let mut query_pairs = url_obj.query_pairs_mut();
query_pairs.clear();
for (key, value) in url_copy.query_pairs() {
let mut matched = false;
for regex in regexs {
if regex
.is_match(&key)
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?
{
matched = true;
break;
}
}
if !matched {
query_pairs.append_pair(&key, &value);
}
}
query_pairs.finish();
}
if options.keep_query_parameters.is_none() {
if let RemoveQueryParametersOptions::All = &options.remove_query_parameters {
url_obj.set_query(None);
}
}
if options.keep_query_parameters.is_some() {
let url_copy = url_obj.clone();
let mut query_pairs = url_obj.query_pairs_mut();
query_pairs.clear();
for (key, value) in url_copy.query_pairs() {
for regex in options.keep_query_parameters.as_ref().unwrap() {
if regex
.is_match(&key)
.map_err(Into::into)
.map_err(NormalizeUrlError::UnexpectedError)?
{
query_pairs.append_pair(&key, &value);
break;
}
}
}
query_pairs.finish();
}
if let Some(query_str) = url_obj.query() {
if query_str.is_empty() {
url_obj.set_query(None);
}
}
if options.sort_query_parameters && url_obj.query_pairs().count() > 0 {
{
let url_copy = url_obj.clone();
let mut query_pairs = url_obj.query_pairs_mut();
query_pairs.clear();
let mut pairs = url_copy.query_pairs().collect::<Vec<_>>();
pairs.sort_by(|a, b| a.0.cmp(&b.0));
query_pairs.extend_pairs(pairs).finish();
}
if let Some(query) = url_obj.query() {
let decoded_query = decode(query).unwrap_or(std::borrow::Cow::Borrowed(query));
url_obj.set_query(Some(&decoded_query.to_string()));
}
}
if options.remove_trailing_slash {
lazy_static! {
static ref RE: Regex = Regex::new(r"\/$").unwrap();
}
url_obj.set_path(&RE.replace(url_obj.path(), "").to_string());
}
if options.remove_explicit_port && url_obj.port().is_some() {
url_obj
.set_port(None)
.map_err(|()| NormalizeUrlError::URLError)?;
}
let old_url_string = url_string;
url_string = url_obj.to_string();
let is_option_empty = |x: Option<&str>| -> bool {
match x {
Some("") => true,
None => true,
_ => false,
}
};
if !options.remove_single_slash
&& url_obj.path() == "/"
&& !old_url_string.ends_with('/')
&& is_option_empty(url_obj.fragment())
{
lazy_static! {
static ref RE: Regex = Regex::new(r"\/$").unwrap();
}
url_string = RE.replace(&url_string, "").to_string();
}
if (options.remove_trailing_slash || url_obj.path() == "/")
&& is_option_empty(url_obj.fragment())
&& options.remove_single_slash
{
lazy_static! {
static ref RE: Regex = Regex::new(r"\/$").unwrap();
}
url_string = RE.replace(&url_string, "").to_string();
}
if has_relative_protocol && !options.normalize_protocol {
lazy_static! {
static ref RE: Regex = Regex::new(r"^http:\/\/").unwrap();
}
url_string = RE.replace(&url_string, "//").to_string();
}
if options.strip_protocol {
lazy_static! {
static ref RE: Regex = Regex::new(r"^(?:https?:)?\/\/").unwrap();
}
url_string = RE.replace(&url_string, "").to_string();
}
Ok(url_string)
}