use std::borrow::Cow;
use std::collections::HashMap;
use std::fmt;
/// An IRI string wrapped to mark it as the output of normalization.
///
/// Equality and hashing are derived from the stored string, so two values
/// compare equal exactly when their strings are identical.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct NormalizedIri {
    iri: String,
}

impl NormalizedIri {
    /// Wraps `iri` as-is, performing no validation or normalization.
    ///
    /// The caller is responsible for supplying an already-normalized string.
    pub fn new_unchecked(iri: String) -> Self {
        NormalizedIri { iri }
    }

    /// Borrows the wrapped string.
    pub fn as_str(&self) -> &str {
        self.iri.as_str()
    }

    /// Consumes the wrapper and hands back the owned string.
    pub fn into_string(self) -> String {
        let NormalizedIri { iri } = self;
        iri
    }

    /// Returns `true` when both values hold identical strings.
    pub fn is_equivalent(&self, other: &Self) -> bool {
        self.as_str() == other.as_str()
    }
}

impl fmt::Display for NormalizedIri {
    /// Writes the wrapped string verbatim (width/fill flags are not applied).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.iri)
    }
}

impl AsRef<str> for NormalizedIri {
    fn as_ref(&self) -> &str {
        self.iri.as_str()
    }
}
/// Reasons an input can be rejected while normalizing an IRI.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NormalizationError {
    /// The input is structurally malformed; the payload describes the problem.
    InvalidFormat(String),
    /// A `%` escape could not be decoded; the payload is the text reported for it.
    InvalidPercentEncoding(String),
    /// The input contains no `:` to terminate a scheme.
    MissingScheme,
    /// The text before the first `:` is not a valid scheme; the payload is that text.
    InvalidScheme(String),
}

impl fmt::Display for NormalizationError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Every variant except `MissingScheme` renders as "<label><payload>".
        let (label, detail) = match self {
            NormalizationError::MissingScheme => {
                return f.write_str("IRI must have a scheme");
            }
            NormalizationError::InvalidFormat(msg) => ("Invalid IRI format: ", msg),
            NormalizationError::InvalidPercentEncoding(seq) => {
                ("Invalid percent encoding: ", seq)
            }
            NormalizationError::InvalidScheme(s) => ("Invalid scheme: ", s),
        };
        f.write_str(label)?;
        f.write_str(detail)
    }
}

impl std::error::Error for NormalizationError {}
/// Shorthand for a `Result` whose error type is [`NormalizationError`].
pub type NormalizationResult<T> = Result<T, NormalizationError>;
/// Parses `iri` into components and reassembles them in normalized form.
///
/// # Errors
///
/// Returns [`NormalizationError::InvalidFormat`] when `iri` is empty, and
/// otherwise forwards whatever error parsing or component normalization
/// reports.
pub fn normalize_iri(iri: &str) -> NormalizationResult<NormalizedIri> {
    if iri.is_empty() {
        let reason = String::from("IRI cannot be empty");
        return Err(NormalizationError::InvalidFormat(reason));
    }
    parse_iri_components(iri)
        .and_then(|components| normalize_components(&components))
        .map(NormalizedIri::new_unchecked)
}
/// The pieces of an IRI as split apart by `parse_iri_components`, before any
/// normalization has been applied.
#[derive(Debug, Clone)]
struct IriComponents {
/// Text before the first `:`, stored exactly as written.
scheme: String,
/// Present only when the text after the scheme begins with `//`.
authority: Option<Authority>,
/// Path text; may be empty.
path: String,
/// Query text without its leading `?`.
query: Option<String>,
/// Fragment text without its leading `#`.
fragment: Option<String>,
}
/// The authority part of an IRI, split into userinfo, host and port.
#[derive(Debug, Clone)]
struct Authority {
/// Text before the last `@` of the authority, when an `@` is present.
userinfo: Option<String>,
/// Host text; a bracketed literal keeps its surrounding `[` and `]`.
host: String,
/// Port number, when one was parsed.
port: Option<u16>,
}
/// Splits `iri` into scheme, optional authority, path, optional query and
/// optional fragment without normalizing any of them.
///
/// The split follows the generic URI grammar: the scheme ends at the first
/// `:`; an authority is present only when the remainder starts with `//` and
/// runs up to the next `/`, `?` or `#`; the fragment starts at the first `#`;
/// the query starts at the first `?` that precedes the fragment. A component
/// is reported as `Some` whenever its delimiter is present, even if the
/// component text itself is empty.
///
/// # Errors
///
/// * [`NormalizationError::MissingScheme`] when `iri` contains no `:`.
/// * [`NormalizationError::InvalidScheme`] when the text before the first
///   `:` is empty or not a valid scheme.
/// * Any error reported while parsing the authority.
fn parse_iri_components(iri: &str) -> NormalizationResult<IriComponents> {
    let (scheme, rest) = iri
        .split_once(':')
        .ok_or(NormalizationError::MissingScheme)?;
    if !is_valid_scheme(scheme) {
        return Err(NormalizationError::InvalidScheme(scheme.to_string()));
    }

    let (authority, remainder) = match rest.strip_prefix("//") {
        Some(after_slashes) => {
            let auth_end = after_slashes
                .find(['/', '?', '#'])
                .unwrap_or(after_slashes.len());
            let (authority_str, tail) = after_slashes.split_at(auth_end);
            (Some(parse_authority(authority_str)?), tail)
        }
        None => (None, rest),
    };

    // The fragment must be peeled off first: everything after the first `#`
    // belongs to it, including any `?`. Splitting on `?` first would
    // misattribute a `?` inside the fragment and would leave the `#` in the
    // path or query.
    let (before_fragment, fragment) = match remainder.split_once('#') {
        Some((head, fragment)) => (head, Some(fragment.to_string())),
        None => (remainder, None),
    };
    let (path, query) = match before_fragment.split_once('?') {
        Some((path, query)) => (path, Some(query.to_string())),
        None => (before_fragment, None),
    };

    Ok(IriComponents {
        scheme: scheme.to_string(),
        authority,
        path: path.to_string(),
        query,
        fragment,
    })
}
/// Splits an authority string into userinfo, host and port.
///
/// An empty authority yields an empty host with no userinfo and no port.
/// Otherwise the text before the last `@` (if any) becomes the userinfo and
/// the remainder is handed to `parse_host_port`.
///
/// # Errors
///
/// Forwards any error from `parse_host_port`.
fn parse_authority(authority: &str) -> NormalizationResult<Authority> {
    if authority.is_empty() {
        let empty = Authority {
            userinfo: None,
            host: String::new(),
            port: None,
        };
        return Ok(empty);
    }

    let (userinfo, host_port) = match authority.rfind('@') {
        Some(at) => (Some(authority[..at].to_string()), &authority[at + 1..]),
        None => (None, authority),
    };

    parse_host_port(host_port).map(|(host, port)| Authority {
        userinfo,
        host,
        port,
    })
}
/// Splits the `host[:port]` part of an authority into a host string and an
/// optional port number.
///
/// A bracketed literal (`[...]`) must start at the first character; the
/// returned host keeps its brackets. For any other host, the text after the
/// last `:` is treated as a port only when it consists solely of ASCII
/// digits; otherwise the whole input is returned as the host. An empty port
/// (`host:`) is accepted and reported as `None`.
///
/// # Errors
///
/// Returns [`NormalizationError::InvalidFormat`] when a `[` is not the first
/// character, when a `[` has no matching `]`, when anything other than
/// `:port` follows the closing bracket, or when the port is not a decimal
/// number that fits in `u16`.
fn parse_host_port(host_port: &str) -> NormalizationResult<(String, Option<u16>)> {
    if let Some(bracket_start) = host_port.find('[') {
        // Anything in front of the bracket would otherwise be silently
        // dropped, and a `]` located before the `[` would make the slice
        // below panic.
        if bracket_start != 0 {
            return Err(NormalizationError::InvalidFormat(
                "Unexpected characters before IPv6 address".to_string(),
            ));
        }
        let bracket_end = host_port.find(']').ok_or_else(|| {
            NormalizationError::InvalidFormat("Unclosed IPv6 bracket".to_string())
        })?;
        let (host, rest) = host_port.split_at(bracket_end + 1);
        let port = match rest.strip_prefix(':') {
            Some(port_str) => parse_port(port_str)?,
            None if rest.is_empty() => None,
            None => {
                return Err(NormalizationError::InvalidFormat(
                    "Invalid characters after IPv6 address".to_string(),
                ));
            }
        };
        return Ok((host.to_string(), port));
    }

    if let Some(colon_pos) = host_port.rfind(':') {
        let candidate = &host_port[colon_pos + 1..];
        if candidate.chars().all(|c| c.is_ascii_digit()) {
            let port = parse_port(candidate)?;
            return Ok((host_port[..colon_pos].to_string(), port));
        }
    }
    Ok((host_port.to_string(), None))
}

/// Parses the text following the `:` of an authority as a port number.
///
/// An empty string means "no port". The text must otherwise be made up only
/// of ASCII digits; this is checked explicitly because `u16::from_str` would
/// also accept a leading `+`.
fn parse_port(port_str: &str) -> NormalizationResult<Option<u16>> {
    if port_str.is_empty() {
        return Ok(None);
    }
    let invalid = || NormalizationError::InvalidFormat(format!("Invalid port: {}", port_str));
    if !port_str.bytes().all(|b| b.is_ascii_digit()) {
        return Err(invalid());
    }
    port_str.parse::<u16>().map(Some).map_err(|_| invalid())
}
/// Reassembles parsed components into a normalized IRI string.
///
/// The following steps are applied:
/// * the scheme is lowercased;
/// * userinfo, query and fragment have their percent-encoding normalized;
/// * a bracketed host is lowercased; any other host has its
///   percent-encoding normalized and is then lowercased;
/// * a port equal to the scheme's default port is omitted;
/// * the path is normalized by `normalize_path`.
///
/// # Errors
///
/// Forwards any error raised while normalizing percent-encoding or the path.
fn normalize_components(components: &IriComponents) -> NormalizationResult<String> {
    let scheme = components.scheme.to_lowercase();

    let mut authority_str = String::new();
    if let Some(auth) = &components.authority {
        authority_str.push_str("//");
        if let Some(userinfo) = &auth.userinfo {
            authority_str.push_str(&normalize_percent_encoding(userinfo)?);
            authority_str.push('@');
        }
        if auth.host.starts_with('[') {
            authority_str.push_str(&auth.host.to_lowercase());
        } else {
            // Decode escapes *before* lowercasing so that a letter written as
            // an escape (e.g. `%41`) is case-folded as well. Lowercasing also
            // lowercases the hex digits of the escapes that remain, so a
            // second pass restores their uppercase form.
            let decoded = normalize_percent_encoding(&auth.host)?;
            let lowered = decoded.to_lowercase();
            authority_str.push_str(&normalize_percent_encoding(&lowered)?);
        }
        if let Some(port) = auth.port {
            if !is_default_port(&scheme, port) {
                authority_str.push(':');
                authority_str.push_str(&port.to_string());
            }
        }
    }

    let normalized_path = normalize_path(&components.path, components.authority.is_some())?;

    let query_str = match &components.query {
        Some(query) => format!("?{}", normalize_percent_encoding(query)?),
        None => String::new(),
    };
    let fragment_str = match &components.fragment {
        Some(fragment) => format!("#{}", normalize_percent_encoding(fragment)?),
        None => String::new(),
    };

    Ok(format!(
        "{}:{}{}{}{}",
        scheme, authority_str, normalized_path, query_str, fragment_str
    ))
}
/// Normalizes the `%XX` escapes in `s`.
///
/// An escape whose value is an unreserved character is replaced by that
/// character; every other escape is kept with its hex digits uppercased. All
/// other characters are copied through unchanged.
///
/// # Errors
///
/// Returns [`NormalizationError::InvalidPercentEncoding`] when a `%` is not
/// followed by two hexadecimal digits. The payload is the offending escape
/// as far as it was present in the input (`%`, `%X` or `%XY`).
fn normalize_percent_encoding(s: &str) -> NormalizationResult<String> {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(ch) = chars.next() {
        if ch != '%' {
            result.push(ch);
            continue;
        }

        let (hex1, hex2) = match (chars.next(), chars.next()) {
            (Some(first), Some(second)) => (first, second),
            (first, _) => {
                // Input ended inside the escape; report only what was there.
                let mut truncated = String::from("%");
                truncated.extend(first);
                return Err(NormalizationError::InvalidPercentEncoding(truncated));
            }
        };

        // `to_digit` accepts exactly 0-9, a-f and A-F. Integer parsing is
        // avoided on purpose because it would also accept a sign, letting
        // sequences such as `%+5` through.
        let value = match (hex1.to_digit(16), hex2.to_digit(16)) {
            (Some(high), Some(low)) => high * 16 + low,
            _ => {
                return Err(NormalizationError::InvalidPercentEncoding(format!(
                    "%{}{}",
                    hex1, hex2
                )));
            }
        };

        match std::char::from_u32(value).filter(|decoded| is_unreserved(*decoded)) {
            Some(decoded) => result.push(decoded),
            None => {
                result.push('%');
                result.push(hex1.to_ascii_uppercase());
                result.push(hex2.to_ascii_uppercase());
            }
        }
    }
    Ok(result)
}
/// Returns `true` for ASCII letters, ASCII digits, `-`, `.`, `_` and `~`.
fn is_unreserved(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '.' | '_' | '~')
}
/// Normalizes the path component of an IRI.
///
/// An empty path becomes `/` when an authority is present. Otherwise the
/// percent-encoding is normalized first and dot segments are removed
/// afterwards, so that a dot segment written with escapes (for example
/// `%2E%2E`) is recognised and removed like a literal one. Doing it in the
/// opposite order would leave a literal `..` in the output and make the
/// normalization non-idempotent.
///
/// # Errors
///
/// Forwards any error from percent-encoding normalization.
fn normalize_path(path: &str, has_authority: bool) -> NormalizationResult<String> {
    if path.is_empty() && has_authority {
        return Ok("/".to_string());
    }
    let decoded = normalize_percent_encoding(path)?;
    Ok(remove_dot_segments(&decoded))
}
/// Collapses `.` and `..` segments in a `/`-separated path.
///
/// Empty segments and `.` segments are dropped, and each `..` discards the
/// most recently kept segment (doing nothing when none is left). A leading
/// `/` is preserved. A trailing `/` is preserved when the input is longer
/// than one character and at least one segment survives. When no segment
/// survives the result is `/` for an absolute input and the empty string
/// otherwise.
fn remove_dot_segments(path: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    for part in path.split('/') {
        match part {
            "" | "." => {}
            ".." => {
                kept.pop();
            }
            other => kept.push(other),
        }
    }

    let is_absolute = path.starts_with('/');
    if kept.is_empty() {
        return if is_absolute {
            "/".to_string()
        } else {
            String::new()
        };
    }

    let mut result = String::with_capacity(path.len() + 1);
    if is_absolute {
        result.push('/');
    }
    result.push_str(&kept.join("/"));
    if path.len() > 1 && path.ends_with('/') {
        result.push('/');
    }
    result
}
/// Returns `true` when `port` is the registered default for `scheme`.
///
/// Schemes without a known default never match.
fn is_default_port(scheme: &str, port: u16) -> bool {
    matches!(get_default_port(scheme), Some(default) if default == port)
}
/// Looks up the default port recorded for `scheme` in `DEFAULT_PORTS`.
///
/// Returns `None` when the table has no entry for `scheme`.
fn get_default_port(scheme: &str) -> Option<u16> {
    let port = *DEFAULT_PORTS.get(scheme)?;
    Some(port)
}
lazy_static::lazy_static! {
    // Default port for each supported scheme, keyed by the scheme name in
    // lowercase. Built once on first access.
    static ref DEFAULT_PORTS: HashMap<&'static str, u16> = [
        ("http", 80),
        ("https", 443),
        ("ftp", 21),
        ("ftps", 990),
        ("ssh", 22),
        ("telnet", 23),
        ("smtp", 25),
        ("pop3", 110),
        ("imap", 143),
        ("ldap", 389),
        ("ldaps", 636),
        ("ws", 80),
        ("wss", 443),
    ]
    .iter()
    .copied()
    .collect();
}
/// Checks whether `scheme` is syntactically valid.
///
/// A valid scheme is non-empty, starts with an ASCII letter, and continues
/// with ASCII letters, ASCII digits, `+`, `-` or `.` only.
fn is_valid_scheme(scheme: &str) -> bool {
    let mut rest = scheme.chars();
    match rest.next() {
        Some(first) if first.is_ascii_alphabetic() => {
            rest.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
        }
        // Empty input, or a first character that is not an ASCII letter.
        _ => false,
    }
}
/// Reports whether two IRIs are identical after normalization.
///
/// # Errors
///
/// Returns the normalization error for `iri1` if it is invalid, otherwise
/// the one for `iri2` if that is invalid.
pub fn iris_equivalent(iri1: &str, iri2: &str) -> NormalizationResult<bool> {
    let left = normalize_iri(iri1)?;
    normalize_iri(iri2).map(|right| left.is_equivalent(&right))
}
/// Normalizes `iri`, borrowing the input when it is already in normal form.
///
/// Returns `Cow::Borrowed(iri)` when the normalized text equals the input
/// and `Cow::Owned` with the normalized text otherwise.
///
/// # Errors
///
/// Forwards any error from [`normalize_iri`].
pub fn normalize_iri_cow(iri: &str) -> NormalizationResult<Cow<'_, str>> {
    let normalized = normalize_iri(iri)?.into_string();
    Ok(if normalized == iri {
        Cow::Borrowed(iri)
    } else {
        Cow::Owned(normalized)
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Normalizes `input`, panicking with "valid IRI" when it is rejected.
    fn normalized(input: &str) -> NormalizedIri {
        normalize_iri(input).expect("valid IRI")
    }

    /// Asserts that every `(input, expected)` pair normalizes as stated.
    fn assert_normalizes(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalized(input).as_str(), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_case_normalization() {
        assert_normalizes(&[("HTTP://EXAMPLE.ORG/Path", "http://example.org/Path")]);
    }

    #[test]
    fn test_percent_encoding_normalization() {
        assert_normalizes(&[
            ("http://example.org/%7Euser", "http://example.org/~user"),
            ("http://example.org/%41%42%43", "http://example.org/ABC"),
            (
                "http://example.org/path%20with%20spaces",
                "http://example.org/path%20with%20spaces",
            ),
        ]);
    }

    #[test]
    fn test_default_port_removal() {
        assert_normalizes(&[
            ("http://example.org:80/path", "http://example.org/path"),
            ("https://example.org:443/path", "https://example.org/path"),
            ("http://example.org:8080/path", "http://example.org:8080/path"),
        ]);
    }

    #[test]
    fn test_path_normalization() {
        assert_normalizes(&[
            ("http://example.org/a/./b/../c", "http://example.org/a/c"),
            ("http://example.org/./a/b", "http://example.org/a/b"),
            ("http://example.org/a/b/..", "http://example.org/a"),
        ]);
    }

    #[test]
    fn test_empty_path_normalization() {
        assert_normalizes(&[("http://example.org", "http://example.org/")]);
    }

    #[test]
    fn test_query_and_fragment() {
        assert_normalizes(&[
            (
                "http://example.org/path?query=value#fragment",
                "http://example.org/path?query=value#fragment",
            ),
            (
                "http://example.org/path?q=%41#%42",
                "http://example.org/path?q=A#B",
            ),
        ]);
    }

    #[test]
    fn test_ipv6_address() {
        assert_normalizes(&[
            ("http://[2001:db8::1]/path", "http://[2001:db8::1]/path"),
            (
                "http://[2001:DB8::1]:8080/path",
                "http://[2001:db8::1]:8080/path",
            ),
        ]);
    }

    #[test]
    fn test_userinfo() {
        assert_normalizes(&[
            (
                "http://user:pass@example.org/path",
                "http://user:pass@example.org/path",
            ),
            (
                "http://%41%42%43@example.org/path",
                "http://ABC@example.org/path",
            ),
        ]);
    }

    #[test]
    fn test_iris_equivalent() {
        let equivalent_pairs = [
            ("HTTP://EXAMPLE.ORG/path", "http://example.org/path"),
            ("http://example.org:80/path", "http://example.org/path"),
            ("http://example.org/a/./b/../c", "http://example.org/a/c"),
        ];
        for &(left, right) in equivalent_pairs.iter() {
            assert!(
                iris_equivalent(left, right).expect("valid IRI"),
                "{:?} vs {:?}",
                left,
                right
            );
        }
        let differs = iris_equivalent("http://example.org/path1", "http://example.org/path2")
            .expect("valid IRI");
        assert!(!differs);
    }

    #[test]
    fn test_complex_normalization() {
        assert_normalizes(&[(
            "HTTP://USER@EXAMPLE.ORG:80/A/./B/../C/%7Euser?Q=%41#%42",
            "http://USER@example.org/A/C/~user?Q=A#B",
        )]);
    }

    #[test]
    fn test_non_http_schemes() {
        assert_normalizes(&[
            ("ftp://example.org:21/path", "ftp://example.org/path"),
            ("urn:isbn:0451450523", "urn:isbn:0451450523"),
        ]);
    }

    #[test]
    fn test_invalid_iri() {
        for input in ["", "not an iri", "http://example.org/%ZZ"].iter() {
            assert!(normalize_iri(input).is_err(), "input: {:?}", input);
        }
    }

    #[test]
    fn test_normalized_iri_methods() {
        let lower = normalized("http://example.org/path");
        let upper = normalized("HTTP://EXAMPLE.ORG/path");
        assert_eq!(lower.as_str(), "http://example.org/path");
        assert!(lower.is_equivalent(&upper));
        assert_eq!(lower, upper);
        let cloned = lower.clone();
        assert_eq!(lower, cloned);
    }

    #[test]
    fn test_normalize_iri_cow() {
        // Already normalized: the input itself is handed back.
        let unchanged = "http://example.org/path";
        let result = normalize_iri_cow(unchanged).expect("valid IRI");
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, unchanged);

        // Needs case folding: a freshly allocated string is returned.
        let result = normalize_iri_cow("HTTP://EXAMPLE.ORG/path").expect("valid IRI");
        assert!(matches!(result, Cow::Owned(_)));
        assert_eq!(result, "http://example.org/path");
    }

    #[test]
    fn test_urn_normalization() {
        assert_normalizes(&[("URN:ISBN:0451450523", "urn:ISBN:0451450523")]);
    }

    #[test]
    fn test_trailing_slash() {
        let with_slash = normalized("http://example.org/path/");
        let without_slash = normalized("http://example.org/path");
        assert_ne!(with_slash, without_slash);
        assert_eq!(with_slash.as_str(), "http://example.org/path/");
        assert_eq!(without_slash.as_str(), "http://example.org/path");
    }
}