use std::fmt::{Display, Formatter};
use ::url::Url;
/// URL schemes this module tells apart.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Scheme {
    /// The `http` scheme.
    Http,
    /// The `https` scheme.
    Https,
    /// The `file` scheme.
    File,
    /// Any other scheme name; ASCII-lowercased when produced by [`Scheme::parse`].
    Other(String),
}

impl Scheme {
    /// Classifies a scheme name, ignoring ASCII case.
    ///
    /// Names other than `http`, `https` and `file` come back as [`Scheme::Other`]
    /// holding the ASCII-lowercased input.
    #[must_use]
    pub fn parse(input: &str) -> Self {
        if input.eq_ignore_ascii_case("http") {
            Self::Http
        } else if input.eq_ignore_ascii_case("https") {
            Self::Https
        } else if input.eq_ignore_ascii_case("file") {
            Self::File
        } else {
            Self::Other(input.to_ascii_lowercase())
        }
    }

    /// Reports whether the scheme is one of `http`, `https` or `file`.
    #[must_use]
    pub const fn is_initially_allowed(&self) -> bool {
        match self {
            Self::Http | Self::Https | Self::File => true,
            Self::Other(_) => false,
        }
    }
}
/// Reasons a candidate URL can be rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UrlError {
    /// The URL is empty.
    Empty,
    /// The URL contains whitespace.
    ContainsWhitespace,
    /// The URL is missing a scheme.
    MissingScheme,
    /// The URL's scheme is not allowed; carries the scheme text.
    DisallowedScheme(String),
    /// The URL is invalid; carries a description of the reason.
    Invalid(String),
    /// An HTTP URL is missing a host.
    MissingHost,
}

impl Display for UrlError {
    /// Writes the human-readable message for this error.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Variants carrying a payload format it inline; the rest share one write below.
        let message = match self {
            Self::DisallowedScheme(scheme) => {
                return write!(f, "URL scheme is not allowed: {scheme}");
            }
            Self::Invalid(reason) => return write!(f, "URL is invalid: {reason}"),
            Self::Empty => "URL is empty",
            Self::ContainsWhitespace => "URL contains whitespace",
            Self::MissingScheme => "URL is missing a scheme",
            Self::MissingHost => "HTTP URL is missing a host",
        };
        f.write_str(message)
    }
}

impl std::error::Error for UrlError {}
/// A validated, normalized URL.
///
/// Values are built by [`IndexUrl::parse`], which stores the parser's serialization of the
/// URL with any fragment removed.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct IndexUrl {
    /// Serialized URL without a fragment.
    raw: String,
    /// Scheme reported by the parser for `raw`.
    scheme: String,
}

impl IndexUrl {
    /// Validates and normalizes `input`.
    ///
    /// Surrounding whitespace is trimmed first. The text before the first `:` must be
    /// `http`, `https` or `file` (ASCII case-insensitive), and `http`/`https` URLs must
    /// name a host. The fragment is removed before the URL is stored.
    ///
    /// # Errors
    ///
    /// - [`UrlError::Empty`] if nothing remains after trimming.
    /// - [`UrlError::ContainsWhitespace`] if the trimmed input still contains whitespace.
    /// - [`UrlError::MissingScheme`] if the input contains no `:`.
    /// - [`UrlError::DisallowedScheme`] if the text before the first `:` is not an allowed
    ///   scheme; the payload is that text as written.
    /// - [`UrlError::MissingHost`] if an `http`/`https` URL does not start its authority
    ///   with exactly `//`, or the parsed URL has no host.
    /// - [`UrlError::Invalid`] if the URL parser rejects the input.
    pub fn parse(input: impl AsRef<str>) -> Result<Self, UrlError> {
        let trimmed = input.as_ref().trim();
        if trimmed.is_empty() {
            return Err(UrlError::Empty);
        }
        if trimmed.chars().any(char::is_whitespace) {
            return Err(UrlError::ContainsWhitespace);
        }
        let Some((scheme, rest)) = trimmed.split_once(':') else {
            return Err(UrlError::MissingScheme);
        };
        let parsed_scheme = Scheme::parse(scheme);
        if !parsed_scheme.is_initially_allowed() {
            return Err(UrlError::DisallowedScheme(scheme.to_owned()));
        }
        let requires_host = matches!(parsed_scheme, Scheme::Http | Scheme::Https);
        // Checked on the raw text: WHATWG-style parsing skips any run of `/` and `\` after
        // `http:`/`https:`, so `https:///docs` or `https://\docs` would otherwise be
        // accepted with the first path segment promoted to the host.
        if requires_host && !Self::has_explicit_authority(rest) {
            return Err(UrlError::MissingHost);
        }
        let mut parsed =
            Url::parse(trimmed).map_err(|error| UrlError::Invalid(error.to_string()))?;
        if requires_host && parsed.host_str().is_none() {
            return Err(UrlError::MissingHost);
        }
        parsed.set_fragment(None);
        Ok(Self {
            raw: parsed.to_string(),
            scheme: parsed.scheme().to_owned(),
        })
    }

    /// Reports whether `rest` (the text after `scheme:`) starts with exactly `//` followed
    /// by something other than a further `/` or `\`.
    fn has_explicit_authority(rest: &str) -> bool {
        match rest.strip_prefix("//") {
            Some(authority) => !authority.starts_with(['/', '\\']),
            None => false,
        }
    }

    /// Returns the normalized URL text.
    #[must_use]
    pub fn as_str(&self) -> &str {
        &self.raw
    }

    /// Returns the URL's scheme.
    #[must_use]
    pub fn scheme(&self) -> &str {
        &self.scheme
    }

    /// Returns the origin derived by [`Origin::from_url`], if any.
    #[must_use]
    pub fn origin(&self) -> Option<Origin> {
        Origin::from_url(self)
    }

    /// Builds a cache key from the normalized URL text.
    ///
    /// ASCII letters and digits are kept (letters lowercased); every run of other
    /// characters becomes a single `_`; leading and trailing `_` are stripped. If nothing
    /// is left, the key is `"url"`.
    #[must_use]
    pub fn cache_key(&self) -> String {
        let mut key = String::with_capacity(self.raw.len());
        let mut previous_was_separator = false;
        for ch in self.raw.chars() {
            if ch.is_ascii_alphanumeric() {
                key.push(ch.to_ascii_lowercase());
                previous_was_separator = false;
            } else if !previous_was_separator {
                // Only the first character of a separator run emits `_`.
                key.push('_');
                previous_was_separator = true;
            }
        }
        let trimmed = key.trim_matches('_');
        if trimmed.is_empty() {
            "url".to_owned()
        } else {
            trimmed.to_owned()
        }
    }
}
impl Display for IndexUrl {
    /// Writes the normalized URL text exactly as stored.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.raw)
    }
}
/// Textual origin of a URL, such as `https://example.com:8443` or `file://`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Origin(String);

impl Origin {
    /// Derives the origin of `url`.
    ///
    /// `http`/`https` URLs yield `scheme://host` plus `:port` when the parser reports a
    /// port; `file` URLs always yield `file://`. Returns `None` if the text cannot be
    /// re-parsed, if an `http`/`https` URL has no host, or for any other scheme.
    #[must_use]
    pub fn from_url(url: &IndexUrl) -> Option<Self> {
        let parsed = Url::parse(url.as_str()).ok()?;
        let scheme = parsed.scheme();
        if scheme == "file" {
            return Some(Self(String::from("file://")));
        }
        if scheme != "http" && scheme != "https" {
            return None;
        }
        let host = parsed.host_str()?;
        let serialized = match parsed.port() {
            Some(port) => format!("{scheme}://{host}:{port}"),
            None => format!("{scheme}://{host}"),
        };
        Some(Self(serialized))
    }

    /// Wraps previously stored origin text without validating it.
    #[must_use]
    pub fn from_stored(input: impl Into<String>) -> Self {
        let text: String = input.into();
        Self(text)
    }

    /// Returns the origin text.
    #[must_use]
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}
impl Display for Origin {
    /// Writes the origin text exactly as stored.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}
#[cfg(test)]
mod tests {
    use super::{IndexUrl, Origin, Scheme, UrlError};

    /// Result type for tests that propagate parse failures with `?`.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// Parses `input` and returns the rejection reason, if it was rejected.
    fn rejection(input: &str) -> Option<UrlError> {
        IndexUrl::parse(input).err()
    }

    /// A plain HTTPS URL is accepted and reports its scheme.
    #[test]
    fn parses_https_url() -> TestResult {
        let url = IndexUrl::parse("https://example.com/docs")?;
        assert_eq!(url.scheme(), "https");
        Ok(())
    }

    /// The empty string is rejected as empty.
    #[test]
    fn rejects_empty_url() {
        assert_eq!(rejection(""), Some(UrlError::Empty));
    }

    /// Whitespace inside the URL is rejected.
    #[test]
    fn rejects_whitespace() {
        assert_eq!(
            rejection("https://example.com/a b"),
            Some(UrlError::ContainsWhitespace)
        );
    }

    /// Schemes outside the allowlist are rejected and echoed back in the error.
    #[test]
    fn rejects_disallowed_scheme() {
        let expected = UrlError::DisallowedScheme("javascript".to_owned());
        assert_eq!(rejection("javascript:alert(1)"), Some(expected));
    }

    /// Input without a `:` has no scheme.
    #[test]
    fn rejects_missing_scheme() {
        assert_eq!(rejection("example.com/path"), Some(UrlError::MissingScheme));
    }

    /// `file` URLs are accepted and print back unchanged.
    #[test]
    fn supports_file_scheme_and_display_roundtrip() -> TestResult {
        let url = IndexUrl::parse("file:///tmp/example.txt")?;
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.to_string(), "file:///tmp/example.txt");
        assert_eq!(url.as_str(), "file:///tmp/example.txt");
        Ok(())
    }

    /// Known scheme names map to their variants; anything else becomes `Other`.
    #[test]
    fn scheme_parser_distinguishes_known_and_other_values() {
        let cases = [
            ("HTTP", Scheme::Http),
            ("https", Scheme::Https),
            ("file", Scheme::File),
            ("mailto", Scheme::Other("mailto".to_owned())),
        ];
        for (input, expected) in cases {
            assert_eq!(Scheme::parse(input), expected);
        }
    }

    /// Only `http`, `https` and `file` are on the initial allowlist.
    #[test]
    fn only_initial_allowlist_schemes_are_marked_allowed() {
        for allowed in [Scheme::Http, Scheme::Https, Scheme::File] {
            assert!(allowed.is_initially_allowed());
        }
        assert!(!Scheme::Other("ssh".to_owned()).is_initially_allowed());
    }

    /// Scheme and host case, the default port and the fragment are normalized away.
    #[test]
    fn normalizes_scheme_host_default_port_and_fragment() -> TestResult {
        let url = IndexUrl::parse("HTTP://EXAMPLE.COM:80/docs#part")?;
        assert_eq!(url.to_string(), "http://example.com/docs");
        Ok(())
    }

    /// An HTTPS URL with a non-default port keeps that port in its origin.
    #[test]
    fn derives_http_origin() -> TestResult {
        let url = IndexUrl::parse("https://example.com:8443/docs")?;
        let expected = Origin::from_stored("https://example.com:8443");
        assert_eq!(url.origin(), Some(expected));
        Ok(())
    }

    /// URLs differing only in case, default port and fragment share one cache key.
    #[test]
    fn cache_keys_are_normalized_and_fragment_independent() -> TestResult {
        let first = IndexUrl::parse("https://EXAMPLE.com:443/docs?q=1#one")?;
        let second = IndexUrl::parse("https://example.com/docs?q=1#two")?;
        assert_eq!(first.as_str(), second.as_str());

        let key = first.cache_key();
        assert_eq!(key, second.cache_key());
        assert!(key.chars().all(|ch| ch != '/' && ch != '?'));
        Ok(())
    }

    /// An HTTPS URL with an empty authority is rejected.
    #[test]
    fn rejects_http_urls_without_hosts() {
        assert_eq!(rejection("https:///docs"), Some(UrlError::MissingHost));
    }

    /// Failures from the underlying parser surface as `Invalid`.
    #[test]
    fn rejects_parser_invalid_urls() {
        let outcome = IndexUrl::parse("http://[::1");
        assert!(matches!(outcome, Err(UrlError::Invalid(_))));
    }

    /// `file` URLs share the `file://` origin, and stored origins print verbatim.
    #[test]
    fn derives_file_origin_and_displays_stored_origin() -> TestResult {
        let url = IndexUrl::parse("file:///tmp/index.html")?;
        let stored = Origin::from_stored("file://");
        assert_eq!(stored.to_string(), "file://");
        assert_eq!(Origin::from_url(&url), Some(stored));
        Ok(())
    }

    /// A URL whose only alphanumerics are in the scheme yields just the scheme as key.
    #[test]
    fn cache_key_has_fallback_for_non_alphanumeric_urls() -> TestResult {
        assert_eq!(IndexUrl::parse("file:///")?.cache_key(), "file");
        Ok(())
    }
}