index-core 1.0.0

Core document model and semantic types for Index.
Documentation
//! URL semantic types.

use std::fmt::{Display, Formatter};

use ::url::Url;

/// URL schemes that Index tells apart.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Scheme {
    /// The `http` scheme.
    Http,
    /// The `https` scheme.
    Https,
    /// The `file` scheme (local files).
    File,
    /// Any scheme without first-class support; [`Scheme::parse`] stores the
    /// ASCII-lowercased name here.
    Other(String),
}

impl Scheme {
    /// Classifies a scheme name, ignoring ASCII case.
    ///
    /// `http`, `https` and `file` map to their dedicated variants. Every other
    /// input, including the empty string, is returned as [`Scheme::Other`]
    /// holding the ASCII-lowercased text. No syntax validation is performed.
    #[must_use]
    pub fn parse(input: &str) -> Self {
        if input.eq_ignore_ascii_case("http") {
            Self::Http
        } else if input.eq_ignore_ascii_case("https") {
            Self::Https
        } else if input.eq_ignore_ascii_case("file") {
            Self::File
        } else {
            Self::Other(input.to_ascii_lowercase())
        }
    }

    /// Reports whether the initial security policy permits this scheme.
    ///
    /// Only `http`, `https` and `file` are permitted.
    #[must_use]
    pub const fn is_initially_allowed(&self) -> bool {
        // Exhaustive on purpose: a new variant must be classified explicitly.
        match self {
            Self::Http | Self::Https | Self::File => true,
            Self::Other(_) => false,
        }
    }
}

/// Reasons a URL can be rejected during parsing or validation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UrlError {
    /// Nothing was left of the input.
    Empty,
    /// The input contained whitespace.
    ContainsWhitespace,
    /// No URL scheme separator was found in the input.
    MissingScheme,
    /// The scheme (carried as written in the input) is rejected by the current policy.
    DisallowedScheme(String),
    /// The URL parser rejected the input; carries the parser's message.
    Invalid(String),
    /// An HTTP or HTTPS URL was given without a host.
    MissingHost,
}

impl Display for UrlError {
    /// Writes the human-readable description of the error.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Variants with a payload format it and return early; the remaining
        // variants share a single `write_str` of a fixed message.
        let message = match self {
            Self::DisallowedScheme(scheme) => {
                return write!(f, "URL scheme is not allowed: {scheme}");
            }
            Self::Invalid(reason) => return write!(f, "URL is invalid: {reason}"),
            Self::Empty => "URL is empty",
            Self::ContainsWhitespace => "URL contains whitespace",
            Self::MissingScheme => "URL is missing a scheme",
            Self::MissingHost => "HTTP URL is missing a host",
        };
        f.write_str(message)
    }
}

impl std::error::Error for UrlError {}

/// A validated URL accepted by Index.
///
/// Values are constructed by [`IndexUrl::parse`]; both fields are private so
/// the stored text always comes from that validation path.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct IndexUrl {
    /// URL text as serialized by the URL parser, with any fragment removed.
    raw: String,
    /// Scheme as reported by the URL parser for `raw`.
    scheme: String,
}

impl IndexUrl {
    /// Parses and validates a URL according to the initial Index policy.
    ///
    /// Surrounding whitespace is trimmed before validation. On success the URL
    /// is stored in the URL parser's serialized form with any fragment removed.
    ///
    /// # Errors
    ///
    /// - [`UrlError::Empty`] if nothing remains after trimming.
    /// - [`UrlError::ContainsWhitespace`] if the trimmed input contains whitespace.
    /// - [`UrlError::MissingScheme`] if the input has no `:` separator.
    /// - [`UrlError::DisallowedScheme`] if the text before the first `:` is not
    ///   `http`, `https` or `file` (compared ignoring ASCII case).
    /// - [`UrlError::MissingHost`] if an `http`/`https` URL does not continue
    ///   with `//` followed by something other than `/` or `\`, or if the
    ///   parser reports no host.
    /// - [`UrlError::Invalid`] if the URL parser rejects the input.
    pub fn parse(input: impl AsRef<str>) -> Result<Self, UrlError> {
        let trimmed = input.as_ref().trim();
        if trimmed.is_empty() {
            return Err(UrlError::Empty);
        }
        if trimmed.chars().any(char::is_whitespace) {
            return Err(UrlError::ContainsWhitespace);
        }

        let Some((scheme, rest)) = trimmed.split_once(':') else {
            return Err(UrlError::MissingScheme);
        };

        let parsed_scheme = Scheme::parse(scheme);
        if !parsed_scheme.is_initially_allowed() {
            return Err(UrlError::DisallowedScheme(scheme.to_owned()));
        }

        let requires_host = matches!(parsed_scheme, Scheme::Http | Scheme::Https);
        if requires_host {
            // For http(s) the URL parser follows the WHATWG rules: it treats
            // `\` like `/` and skips any extra slashes before the authority.
            // Without this check `https:///docs` and `https://\docs` would both
            // be accepted with `docs` silently promoted to the host.
            let has_authority = match rest.strip_prefix("//") {
                Some(after) => !after.starts_with(|ch: char| ch == '/' || ch == '\\'),
                None => false,
            };
            if !has_authority {
                return Err(UrlError::MissingHost);
            }
        }

        let mut parsed =
            Url::parse(trimmed).map_err(|error| UrlError::Invalid(error.to_string()))?;
        if requires_host && parsed.host_str().is_none() {
            return Err(UrlError::MissingHost);
        }
        // Fragments are dropped so URLs differing only by fragment compare equal.
        parsed.set_fragment(None);

        Ok(Self {
            raw: parsed.to_string(),
            scheme: parsed.scheme().to_owned(),
        })
    }

    /// Returns the normalized URL string (parser-serialized, fragment removed).
    #[must_use]
    pub fn as_str(&self) -> &str {
        &self.raw
    }

    /// Returns the scheme as reported by the URL parser.
    #[must_use]
    pub fn scheme(&self) -> &str {
        &self.scheme
    }

    /// Returns the origin used for per-site state when one can be derived.
    #[must_use]
    pub fn origin(&self) -> Option<Origin> {
        Origin::from_url(self)
    }

    /// Returns a deterministic filesystem-safe cache key.
    ///
    /// ASCII letters and digits are kept (lowercased); every run of other
    /// characters collapses into a single `_`, and leading/trailing `_` are
    /// stripped. If nothing remains, the key is `"url"`.
    ///
    /// The mapping is lossy: URLs that differ only in punctuation (for example
    /// `/a/b` versus `/a-b`) or in letter case produce the same key, and the
    /// key length is not bounded.
    #[must_use]
    pub fn cache_key(&self) -> String {
        let mut key = String::with_capacity(self.raw.len());
        // Tracks whether the last pushed character was the `_` separator so
        // consecutive non-alphanumerics emit only one.
        let mut previous_was_separator = false;
        for ch in self.raw.chars() {
            if ch.is_ascii_alphanumeric() {
                key.push(ch.to_ascii_lowercase());
                previous_was_separator = false;
            } else if !previous_was_separator {
                key.push('_');
                previous_was_separator = true;
            }
        }
        let trimmed = key.trim_matches('_');
        if trimmed.is_empty() {
            "url".to_owned()
        } else {
            trimmed.to_owned()
        }
    }
}

impl Display for IndexUrl {
    /// Writes the stored URL string verbatim.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let Self { raw, .. } = self;
        f.write_str(raw)
    }
}

/// A normalized URL origin used for per-site state.
///
/// Newtype over the origin string. [`Origin::from_url`] produces
/// `scheme://host` (plus `:port` when the parsed URL reports one) for
/// `http`/`https` URLs and the fixed value `file://` for `file` URLs;
/// [`Origin::from_stored`] stores the given string unchanged.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Origin(String);

impl Origin {
    /// Builds an origin from a normalized Index URL.
    #[must_use]
    pub fn from_url(url: &IndexUrl) -> Option<Self> {
        let parsed = Url::parse(url.as_str()).ok()?;
        match parsed.scheme() {
            "http" | "https" => {
                let host = parsed.host_str()?;
                let port = parsed
                    .port()
                    .map(|port| format!(":{port}"))
                    .unwrap_or_default();
                Some(Self(format!("{}://{}{}", parsed.scheme(), host, port)))
            }
            "file" => Some(Self("file://".to_owned())),
            _ => None,
        }
    }

    /// Parses a stored origin string.
    #[must_use]
    pub fn from_stored(input: impl Into<String>) -> Self {
        Self(input.into())
    }

    /// Returns the normalized origin string.
    #[must_use]
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

impl Display for Origin {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

#[cfg(test)]
mod tests {
    use super::{IndexUrl, Origin, Scheme, UrlError};

    /// Return type for tests that propagate unexpected parse failures with `?`.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    #[test]
    fn parses_https_url() {
        let scheme =
            IndexUrl::parse("https://example.com/docs").map(|url| url.scheme().to_owned());
        assert_eq!(scheme, Ok("https".to_owned()));
    }

    #[test]
    fn rejects_empty_url() {
        let outcome = IndexUrl::parse("");
        assert_eq!(outcome, Err(UrlError::Empty));
    }

    #[test]
    fn rejects_whitespace() {
        let outcome = IndexUrl::parse("https://example.com/a b");
        assert_eq!(outcome, Err(UrlError::ContainsWhitespace));
    }

    #[test]
    fn rejects_disallowed_scheme() {
        let expected = UrlError::DisallowedScheme("javascript".to_owned());
        assert_eq!(IndexUrl::parse("javascript:alert(1)"), Err(expected));
    }

    #[test]
    fn rejects_missing_scheme() {
        let outcome = IndexUrl::parse("example.com/path");
        assert_eq!(outcome, Err(UrlError::MissingScheme));
    }

    #[test]
    fn supports_file_scheme_and_display_roundtrip() -> TestResult {
        const INPUT: &str = "file:///tmp/example.txt";

        let url = IndexUrl::parse(INPUT)?;
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.to_string(), INPUT);
        assert_eq!(url.as_str(), INPUT);
        Ok(())
    }

    #[test]
    fn scheme_parser_distinguishes_known_and_other_values() {
        let cases = [
            ("HTTP", Scheme::Http),
            ("https", Scheme::Https),
            ("file", Scheme::File),
            ("mailto", Scheme::Other("mailto".to_owned())),
        ];
        for (input, expected) in &cases {
            assert_eq!(&Scheme::parse(input), expected);
        }
    }

    #[test]
    fn only_initial_allowlist_schemes_are_marked_allowed() {
        for allowed in &[Scheme::Http, Scheme::Https, Scheme::File] {
            assert!(allowed.is_initially_allowed());
        }
        let rejected = Scheme::Other("ssh".to_owned());
        assert!(!rejected.is_initially_allowed());
    }

    #[test]
    fn normalizes_scheme_host_default_port_and_fragment() {
        let normalized =
            IndexUrl::parse("HTTP://EXAMPLE.COM:80/docs#part").map(|url| url.to_string());
        assert_eq!(normalized, Ok("http://example.com/docs".to_owned()));
    }

    #[test]
    fn derives_http_origin() -> TestResult {
        let origin = IndexUrl::parse("https://example.com:8443/docs")?.origin();
        let expected = Origin::from_stored("https://example.com:8443");
        assert_eq!(origin, Some(expected));
        Ok(())
    }

    #[test]
    fn cache_keys_are_normalized_and_fragment_independent() -> TestResult {
        let with_default_port = IndexUrl::parse("https://EXAMPLE.com:443/docs?q=1#one")?;
        let plain = IndexUrl::parse("https://example.com/docs?q=1#two")?;

        assert_eq!(with_default_port.as_str(), plain.as_str());

        let key = with_default_port.cache_key();
        assert_eq!(key, plain.cache_key());
        assert!(!key.contains('/'));
        assert!(!key.contains('?'));
        Ok(())
    }

    #[test]
    fn rejects_http_urls_without_hosts() {
        let outcome = IndexUrl::parse("https:///docs");
        assert_eq!(outcome, Err(UrlError::MissingHost));
    }

    #[test]
    fn rejects_parser_invalid_urls() {
        let outcome = IndexUrl::parse("http://[::1");
        assert!(matches!(outcome, Err(UrlError::Invalid(_))));
    }

    #[test]
    fn derives_file_origin_and_displays_stored_origin() -> TestResult {
        let stored = Origin::from_stored("file://");
        let url = IndexUrl::parse("file:///tmp/index.html")?;

        assert_eq!(Origin::from_url(&url), Some(stored.clone()));
        assert_eq!(stored.to_string(), "file://");
        Ok(())
    }

    #[test]
    fn cache_key_has_fallback_for_non_alphanumeric_urls() -> TestResult {
        // Only the scheme contributes alphanumerics here, so it is the whole key.
        let key = IndexUrl::parse("file:///")?.cache_key();
        assert_eq!(key, "file");
        Ok(())
    }
}