use crate::Url;
use regex::Regex;
use std::collections::HashSet;
use std::sync::LazyLock;
/// Matches a leading `scheme:` prefix (letters, digits, `.`, `+`, `-`,
/// case-insensitive) at the very start of the input.
static PROTOCOL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?i)^([a-z0-9.+-]+:)"#).unwrap());

/// Matches a trailing `:` followed by zero or more digits, i.e. the port
/// suffix of a host string (possibly just a bare `:`).
static PORT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#":[0-9]*$"#).unwrap());

/// Characters that terminate the host portion of a URL.
///
/// The length expression mirrors the four groups the table is assembled from.
/// The backtick appears in both of the first two groups; the duplicate is
/// harmless for searching and is kept so the element count matches the type.
const NON_HOST_CHARS: [char; 8 + 6 + 1 + 5] = [
    // group 1 (8)
    '<', '>', '"', '`', ' ', '\r', '\n', '\t',
    // group 2 (6)
    '{', '}', '|', '\\', '^', '`',
    // group 3 (1)
    '\'',
    // group 4 (5)
    '%', '/', '?', ';', '#',
];

/// Characters past which the `user:password@` section cannot extend.
const HOST_ENDING_CHARS: [char; 3] = ['/', '?', '#'];

/// Matches a complete hostname label: at most 63 characters drawn from
/// `+`, ASCII letters, digits, `_` and `-`.
static HOSTNAME_PART_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"^[+a-z0-9A-Z_-]{0,63}$"#).unwrap());

/// Splits a label into its longest acceptable prefix (capture group 1) and
/// whatever follows it (capture group 2).
static HOSTNAME_PART_START: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"^([+a-z0-9A-Z_-]{0,63})(.*)$"#).unwrap());

/// Protocols that never carry a host. Each is listed with and without the
/// trailing colon.
static HOSTLESS_PROTOCOL: LazyLock<HashSet<&'static str>> =
    LazyLock::new(|| HashSet::from(["javascript", "javascript:"]));

/// Protocols that only have a host when it is introduced by `//`. Each is
/// listed with and without the trailing colon.
static SLASHED_PROTOCOL: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "http", "https", "ftp", "gopher", "file", "http:", "https:", "ftp:", "gopher:", "file:",
    ])
});
/// Leniently splits `url` into the components of a [`Url`].
///
/// The input is trimmed first. Components that are not present keep the value
/// they have in `Url::default()`. No error is reported for malformed input:
/// characters that cannot belong to a hostname are pushed back in front of the
/// path instead.
///
/// Fields written: `protocol` (including the trailing `:`), `slashes`, `auth`,
/// `port` (digits only), `hostname` (IPv6 brackets removed), `hash` (including
/// `#`), `search` (including `?`) and `pathname`.
pub fn parse_url(url: &str) -> Url {
    let mut this = Url::default();
    let mut rest = url.trim();

    // Backing storage for `rest` in the one case where the remainder is not a
    // contiguous slice of the input (rejected hostname characters + path, with
    // a port in between that must be skipped).
    let respliced: String;

    let proto = PROTOCOL_PATTERN.find(rest).map(|m| m.as_str());
    if let Some(proto) = proto {
        this.protocol = Some(proto.into());
        rest = &rest[proto.len()..];
    }

    // These table lookups use the protocol exactly as written, so they are
    // case-sensitive; only the pathname defaulting at the end lowercases it.
    let hostless = proto.is_some_and(|p| HOSTLESS_PROTOCOL.contains(p));
    let needs_slashes = proto.is_some_and(|p| SLASHED_PROTOCOL.contains(p));

    if !hostless {
        if let Some(after_slashes) = rest.strip_prefix("//") {
            rest = after_slashes;
            this.slashes = true;
        }
    }

    // A host is present when `//` was consumed, or when there is a protocol
    // that does not require `//` to introduce its host.
    let has_host = !hostless && (this.slashes || (proto.is_some() && !needs_slashes));
    if has_host {
        // The auth section ends at the last `@` that appears before the first
        // `/`, `?` or `#`:  http://a@b/c@d  =>  auth "a", host "b", path "/c@d".
        let authority_end = rest.find(HOST_ENDING_CHARS).unwrap_or(rest.len());
        if let Some(at_sign) = rest[..authority_end].rfind('@') {
            this.auth = Some(rest[..at_sign].into());
            rest = &rest[at_sign + 1..];
        }

        // The host runs up to the first character that can never be in a host.
        let mut host_end = rest.find(NON_HOST_CHARS).unwrap_or(rest.len());
        if rest[..host_end].ends_with(':') {
            // A dangling `:` is left in the remainder rather than in the host.
            host_end -= 1;
        }
        let (mut host, after_host) = rest.split_at(host_end);
        rest = after_host;

        // Peel off the port. The pattern is anchored at the end of `host`, so
        // everything from the match start onwards is the port suffix.
        if let Some(port_match) = PORT_PATTERN.find(host) {
            let digits = &port_match.as_str()[1..];
            if !digits.is_empty() {
                this.port = Some(digits.into());
            }
            host = &host[..port_match.start()];
        }

        // `[...]` is taken to be an IPv6 literal: skip label validation and
        // store it without the brackets.
        let is_ipv6 = host.starts_with('[') && host.ends_with(']');
        let hostname = if is_ipv6 {
            &host[1..host.len() - 1]
        } else if let Some(split) = invalid_hostname_tail(host) {
            // Everything from the first unacceptable character onwards is not
            // host; it goes back in front of the remainder. This is built by
            // concatenation instead of re-slicing `url`, because the input may
            // carry trailing whitespace and a port may sit between the two
            // pieces, either of which would make offset arithmetic wrong (or
            // land inside a multi-byte character and panic).
            respliced = [&host[split..], rest].concat();
            rest = &respliced;
            &host[..split]
        } else {
            host
        };
        this.hostname = Some(hostname.into());
    }

    // Chop components off the tail: fragment first, then query.
    if let Some(hash) = rest.find('#') {
        this.hash = Some(rest[hash..].into());
        rest = &rest[..hash];
    }
    if let Some(qm) = rest.find('?') {
        this.search = Some(rest[qm..].into());
        rest = &rest[..qm];
    }
    if !rest.is_empty() {
        this.pathname = Some(rest.into());
    }

    // A slashed protocol with a non-empty host always gets a pathname, even if
    // it is empty. This lookup is case-insensitive.
    let slashed_protocol =
        proto.is_some_and(|p| SLASHED_PROTOCOL.contains(p.to_ascii_lowercase().as_str()));
    if slashed_protocol
        && this.hostname.as_ref().is_some_and(|h| !h.is_empty())
        && this.pathname.is_none()
    {
        this.pathname = Some(String::new());
    }

    this
}

/// Returns the byte offset in `host` where the acceptable hostname ends, or
/// `None` when every dot-separated label is acceptable.
///
/// `host[..offset]` is the part to keep as hostname and `host[offset..]` is
/// the part that has to be treated as not belonging to the host.
fn invalid_hostname_tail(host: &str) -> Option<usize> {
    let mut label_start = 0;
    for label in host.split('.') {
        if !label.is_empty() && !HOSTNAME_PART_PATTERN.is_match(label) {
            // Non-ASCII characters are tolerated in a label. Replace each with
            // a placeholder so it still counts towards the length limit, then
            // test the label again.
            let ascii_only: String = label
                .chars()
                .map(|c| if c.is_ascii() { c } else { 'x' })
                .collect();
            if !HOSTNAME_PART_PATTERN.is_match(&ascii_only) {
                // Keep the longest acceptable prefix of the offending label.
                let valid_len = HOSTNAME_PART_START
                    .captures(label)
                    .and_then(|caps| caps.get(1))
                    .map_or(0, |m| m.end());
                return Some(label_start + valid_len);
            }
        }
        // Step over this label and the `.` that follows it.
        label_start += label.len() + 1;
    }
    None
}