// libdav 0.10.3 - Docs.rs

// Copyright 2024 Hugo Osvaldo Barrera
//
// SPDX-License-Identifier: ISC

//! Functions for encoding and decoding percent-encoded path components.
//!
//! libdav follows the convention described below, and so should dependent software:
//!
//! - An `Uri` has its path component percent-encoded. The `Uri` type enforces validation of this.
//!   Additionally, all `Uri` instances produced by this library have their path component
//!   normalised with [`strict_percent_encoded`].
//! - `href` strings shall have only reserved characters percent-encoded. This makes them better
//!   suitable for display, and for usage in filenames. These are frequently encoded into XML, so
//!   do not require escaping non-reserved characters. Handling of XML entities is done internally
//!   by libdav; consumers need not concern themselves with this.
//!
//! Hrefs should be treated as opaque strings. These MAY be percent-encoded, but keep in mind that
//! decoding them and re-encoding them may not yield the same result. From [RFC3986, section
//! 2.2](https://www.rfc-editor.org/rfc/rfc3986#section-2.2):
//!
//! > URIs that differ in the replacement of a reserved character with its corresponding
//! > percent-encoded octet are not equivalent.  Percent-encoding a reserved character, or
//! > decoding a percent-encoded octet that corresponds to a reserved character, will change how
//! > the URI is interpreted by most applications.  Thus, characters in the reserved set are
//! > protected from normalization and are therefore safe to be used by scheme-specific and
//! > producer-specific algorithms for delimiting data subcomponents within a URI.
//!
//! See [this short article](https://whynothugo.nl/journal/2024/12/27/urls-and-percent-encoding/)
//! for more details.

use std::{borrow::Cow, fmt::Write};

/// Returned when invalid input is provided to [`normalise_percent_encoded`].
///
/// Every variant describes a malformed `%XX` escape sequence found in the input.
#[derive(Debug, thiserror::Error, PartialEq)]
pub enum NormalisationError {
    /// Unexpected end of string after percent sign.
    ///
    /// Fewer than two bytes follow the `%`, so there is no complete `XX` pair to decode.
    #[error("Unexpected end of string after percent sign.")]
    TruncatedPercent,
    /// Non-hexadecimal digits after percent sign.
    ///
    /// Two bytes follow the `%`, but at least one of them is not a hexadecimal digit.
    #[error("Non-hexadecimal digits after percent sign.")]
    NonHex,
    /// Invalid sequence after percent sign.
    ///
    /// The escape decodes to a byte that cannot be turned into a character: it is not a valid
    /// UTF-8 lead byte, an expected continuation byte is not percent-encoded, or the decoded
    /// bytes are rejected by UTF-8 validation. The payload is a byte offset into the input
    /// locating the problem.
    #[error("Invalid sequence after percent sign at index: {0}.")]
    InvalidPercent(usize),
}

/// Decode the two hexadecimal digits following the `%` located at `bytes[i]`.
///
/// Only `bytes[i + 1]` and `bytes[i + 2]` are inspected; the byte at index `i` itself is not
/// checked, so callers are expected to have matched the `%` already.
///
/// # Errors
///
/// - [`NormalisationError::TruncatedPercent`] if fewer than two bytes follow index `i`.
/// - [`NormalisationError::NonHex`] if both bytes exist but either is not a hexadecimal digit.
#[inline]
fn decode_hex_pair(bytes: &[u8], i: usize) -> Result<u8, NormalisationError> {
    // Outer `None`: the byte is missing. Inner `None`: the byte is present but not a hex digit.
    let nibble = |offset: usize| bytes.get(i + offset).map(|&b| char::from(b).to_digit(16));

    match (nibble(1), nibble(2)) {
        (Some(Some(high)), Some(Some(low))) => {
            // Each nibble is at most 15, so the combined value is at most 255.
            Ok(u8::try_from(high * 16 + low).expect("hex digits fit in u8"))
        }
        // A missing byte is reported before a malformed one.
        (None, _) | (_, None) => Err(NormalisationError::TruncatedPercent),
        _ => Err(NormalisationError::NonHex),
    }
}

/// Normalise a percent encoded path.
///
/// Reserved characters shall remain percent encoded, but their hexadecimal representation
/// normalised to uppercase. The percent sign itself (`%25`) is treated the same way: a decoded
/// `%` would be read as the start of a new escape sequence, so decoding it would change the
/// meaning of the path and make the output impossible to normalise again. All other characters
/// shall be decoded.
///
/// Percent-encoded sequences represent UTF-8 bytes. Multi-byte UTF-8 sequences are properly
/// decoded into Unicode characters before deciding whether to keep them encoded.
///
/// Text outside of percent-encoded sequences is copied verbatim. When the input contains no
/// percent sign at all, it is returned borrowed and no allocation takes place.
///
/// # Errors
///
/// Returns [`NormalisationError`] if the input string contains invalid percent-encoded data:
///
/// - [`NormalisationError::TruncatedPercent`] if a `%` is followed by fewer than two bytes.
/// - [`NormalisationError::NonHex`] if a `%` is followed by non-hexadecimal digits.
/// - [`NormalisationError::InvalidPercent`] if the escaped bytes do not form a valid UTF-8
///   character. The index is the byte offset of the `%` that starts the offending character.
#[allow(clippy::missing_panics_doc)] // Internal expects are provably infallible.
pub fn normalise_percent_encoded(input: &str) -> Result<Cow<'_, str>, NormalisationError> {
    let mut result = String::new();
    // End of the last escape sequence already written to `result`. It stays at zero only while
    // no escape has been processed, because every escape is at least three bytes long.
    let mut last_pos = 0;
    let bytes = input.as_bytes();
    let mut i = 0;

    while let Some(&current_byte) = bytes.get(i) {
        if current_byte != b'%' {
            i += 1;
            continue;
        }

        // Offset of the percent sign opening this character; used for slicing and for errors.
        let start = i;

        // Decode the first byte to determine UTF-8 character length.
        let first_byte = decode_hex_pair(bytes, i)?;
        i += 3;
        let char_len = match first_byte {
            0x00..=0x7F => 1, // ASCII: 0xxxxxxx
            0xC0..=0xDF => 2, // 2-byte: 110xxxxx
            0xE0..=0xEF => 3, // 3-byte: 1110xxxx
            0xF0..=0xF7 => 4, // 4-byte: 11110xxx
            // Stray continuation byte (10xxxxxx) or a byte which never occurs in UTF-8.
            _ => return Err(NormalisationError::InvalidPercent(start)),
        };

        // Decode remaining bytes of this UTF-8 character. Each one must be percent-encoded too.
        let mut char_bytes = [first_byte, 0, 0, 0];
        for dest in char_bytes.iter_mut().skip(1).take(char_len - 1) {
            if bytes.get(i) != Some(&b'%') {
                return Err(NormalisationError::InvalidPercent(start));
            }
            *dest = decode_hex_pair(bytes, i)?;
            i += 3;
        }

        // Decode the complete UTF-8 character. Full validation happens here, which also rejects
        // overlong encodings and lead bytes that the length table above lets through.
        let ch = std::str::from_utf8(char_bytes.get(..char_len).expect("char_len is 1-4"))
            .map_err(|_| NormalisationError::InvalidPercent(start))?
            .chars()
            .next()
            .expect("decoded string has exactly one char");

        // Flush the literal text between the previous escape and this one.
        result.push_str(
            input
                .get(last_pos..start)
                .expect("last_pos and start are valid indices"),
        );
        match ch {
            // Reserved characters, plus the escape introducer itself: keep encoded (uppercase).
            ':' | '/' | '?' | '#' | '[' | ']' | '@' | '!' | '$' | '&' | '\'' | '(' | ')' | '*'
            | '+' | ',' | ';' | '=' | '%' => {
                // Every character in this arm is ASCII, so `first_byte` is the whole character.
                write!(result, "%{first_byte:02X}").expect("write to String");
            }
            _ => result.push(ch),
        }

        last_pos = i;
    }

    if last_pos == 0 {
        // No escape sequence was found: the input is already in normal form.
        Ok(Cow::Borrowed(input))
    } else {
        result.push_str(input.get(last_pos..).expect("last_pos is valid index"));
        Ok(Cow::Owned(result))
    }
}

/// Percent-encode every character of a path that is neither reserved nor unreserved.
///
/// The input may already contain percent-encoded sequences: the `%` character is passed through
/// untouched, so reserved characters which are percent-encoded stay exactly as they are.
/// Reserved and unreserved characters are copied verbatim. Everything else (spaces, control
/// characters, non-ASCII text, ...) is replaced by the uppercase percent-encoding of its UTF-8
/// bytes.
///
/// ```text
/// reserved    = gen-delims / sub-delims
/// gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
/// sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
///              / "*" / "+" / "," / ";" / "="
///
/// unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
/// ```
///
/// The resulting string is not display-friendly, but is suitable for use as the path in an HTTP
/// request. If nothing needs encoding, the input is returned borrowed.
///
/// Calling this function with an input where ANY non-reserved character has already been
/// escaped will produce a double-encoded output. This output would point to a resource
/// different than the original input.
#[must_use]
pub fn strict_percent_encoded(input: &str) -> Cow<'_, str> {
    // Characters copied verbatim: reserved, unreserved, and '%' (assumed to start an escape).
    let passes_through = |c: char| {
        c.is_ascii_alphanumeric()
            || matches!(
                c,
                ':' | '/' | '?' | '#' | '[' | ']' | '@'
                    | '!' | '$' | '&' | '\'' | '(' | ')'
                    | '*' | '+' | ',' | ';' | '='
                    | '-' | '.' | '_' | '~'
                    | '%'
            )
    };

    let mut encoded = String::new();
    // Start of the input which has not been copied into `encoded` yet.
    let mut copied_up_to = 0;
    let mut utf8 = [0u8; 4];

    for (offset, ch) in input.char_indices().filter(|&(_, c)| !passes_through(c)) {
        // Copy the untouched run preceding this character, then the character's escaped bytes.
        encoded.push_str(&input[copied_up_to..offset]);
        for byte in ch.encode_utf8(&mut utf8).bytes() {
            write!(encoded, "%{byte:02X}").expect("Appending to string must succeed");
        }
        copied_up_to = offset + ch.len_utf8();
    }

    // Any escaped character leaves at least "%XX" behind, so empty means nothing was encoded.
    if encoded.is_empty() {
        return Cow::Borrowed(input);
    }
    encoded.push_str(&input[copied_up_to..]);
    Cow::Owned(encoded)
}

#[cfg(test)]
mod test {
    use super::{NormalisationError, normalise_percent_encoded, strict_percent_encoded};

    // Tests for normalise_percent_encoded

    /// An escaped unreserved character is decoded.
    #[test]
    fn normalise_percent_encoded_valid_percent_encoding() {
        assert_eq!(normalise_percent_encoded("%41").unwrap(), "A");
    }

    /// A non-hexadecimal digit inside an escape is rejected.
    #[test]
    fn normalise_percent_encoded_invalid_hex_characters() {
        assert_eq!(
            normalise_percent_encoded("%4G").unwrap_err(),
            NormalisationError::NonHex
        );
    }

    /// An escape with a single digit is reported as truncated.
    #[test]
    fn normalise_percent_encoded_incomplete_percent_encoding() {
        assert_eq!(
            normalise_percent_encoded("%4").unwrap_err(),
            NormalisationError::TruncatedPercent
        );
    }

    /// A lone percent sign at the end of the input is reported as truncated.
    #[test]
    fn normalise_percent_encoded_trailing_percent() {
        assert_eq!(
            normalise_percent_encoded("hello%").unwrap_err(),
            NormalisationError::TruncatedPercent
        );
    }

    /// A literal reserved character is left untouched.
    #[test]
    fn normalise_percent_encoded_unencoded_reserved_character() {
        assert_eq!(normalise_percent_encoded("hello/").unwrap(), "hello/");
    }

    /// An escaped reserved character stays escaped, with uppercase hex digits.
    #[test]
    fn normalise_percent_encoded_reserved_character() {
        assert_eq!(normalise_percent_encoded("hello%2f").unwrap(), "hello%2F");
    }

    /// `@` is reserved, so its escape is kept.
    #[test]
    fn normalise_percent_encoded_keep_at_sign() {
        assert_eq!(normalise_percent_encoded("%40").unwrap(), "%40");
    }

    /// The empty string normalises to itself.
    #[test]
    fn normalise_percent_encoded_empty_string() {
        assert_eq!(normalise_percent_encoded("").unwrap(), "");
    }

    /// Consecutive escapes forming multi-byte UTF-8 characters are decoded as whole characters.
    #[test]
    fn normalise_percent_encoded_multibyte_utf8() {
        // Greek "Επαφές" - each character is 2 bytes in UTF-8
        // Ε (U+0395) = 0xCE 0x95
        // π (U+03C0) = 0xCF 0x80
        // α (U+03B1) = 0xCE 0xB1
        // φ (U+03C6) = 0xCF 0x86
        // έ (U+03AD) = 0xCE 0xAD
        // ς (U+03C2) = 0xCF 0x82
        assert_eq!(
            normalise_percent_encoded("%CE%95%CF%80%CE%B1%CF%86%CE%AD%CF%82").unwrap(),
            "Επαφές"
        );
    }

    /// Literal ASCII text ahead of escaped multi-byte characters is preserved.
    #[test]
    fn normalise_percent_encoded_mixed_ascii_and_utf8() {
        // "hello-Επαφές" where Greek is percent-encoded
        assert_eq!(
            normalise_percent_encoded("hello-%CE%95%CF%80%CE%B1%CF%86%CE%AD%CF%82").unwrap(),
            "hello-Επαφές"
        );
    }

    /// Overlong UTF-8 encodings are refused.
    #[test]
    fn normalise_percent_encoded_rejects_overlong_encoding() {
        // %C1%81 is an overlong encoding of 'A' (U+0041).
        // Valid UTF-8 requires ASCII to be encoded as single byte.
        let outcome = normalise_percent_encoded("%C1%81");
        assert!(outcome.is_err());
    }

    // Tests for strict_percent_encoded

    /// Reserved characters pass through unchanged.
    #[test]
    fn strict_percent_encoded_reserved_characters() {
        let reserved = ":/?#[]@!$&'()*+,;=";
        assert_eq!(strict_percent_encoded(reserved), reserved);
    }

    /// Unreserved characters pass through unchanged.
    #[test]
    fn strict_percent_encoded_unreserved_characters() {
        let unreserved = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~";
        assert_eq!(strict_percent_encoded(unreserved), unreserved);
    }

    /// Existing escape sequences are not encoded a second time.
    #[test]
    fn strict_percent_encoded_percent_encoded_characters() {
        let already_encoded = "%20%2F%3F";
        assert_eq!(strict_percent_encoded(already_encoded), already_encoded);
    }

    /// Three-byte characters are encoded byte by byte.
    #[test]
    fn strict_percent_encoded_multibyte_characters() {
        assert_eq!(
            strict_percent_encoded("こんにちは"),
            "%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF"
        );
    }

    /// Two-byte characters and spaces are encoded; ASCII letters and `!` are not.
    #[test]
    fn strict_percent_encoded_german_special_characters() {
        assert_eq!(
            strict_percent_encoded("Grüße aus Köln!"),
            "Gr%C3%BC%C3%9Fe%20aus%20K%C3%B6ln!"
        );
    }

    /// Four-byte characters are encoded byte by byte.
    #[test]
    fn strict_percent_encoded_emoji() {
        assert_eq!(
            strict_percent_encoded("😀🔥"),
            "%F0%9F%98%80%F0%9F%94%A5"
        );
    }

    /// Reserved characters, an existing escape and an emoji in a single input.
    #[test]
    fn strict_percent_encoded_mixed_characters() {
        assert_eq!(
            strict_percent_encoded("Hello:/World%20😀"),
            "Hello:/World%20%F0%9F%98%80"
        );
    }

    /// The tilde is unreserved and must not be encoded.
    #[test]
    fn strict_percent_encoded_tilde() {
        assert_eq!(strict_percent_encoded("~"), "~");
    }
}