use std::{borrow::Cow, fmt::Write};
/// Errors produced while normalising a percent-encoded string.
#[derive(Debug, thiserror::Error, PartialEq)]
pub enum NormalisationError {
/// Fewer than two bytes follow a `%`, so the escape is cut short by the end
/// of the input.
#[error("Unexpected end of string after percent sign.")]
TruncatedPercent,
/// At least one of the two bytes following a `%` is not a hexadecimal digit.
#[error("Non-hexadecimal digits after percent sign.")]
NonHex,
/// The escapes decode to bytes that do not form a single valid UTF-8
/// character (bad leading byte, missing continuation escape, or invalid
/// sequence).
///
/// The payload is a byte offset into the input: the decoder's position at the
/// point of failure, i.e. just past the last escape it consumed.
#[error("Invalid sequence after percent sign at index: {0}.")]
InvalidPercent(usize),
}
#[inline]
fn decode_hex_pair(bytes: &[u8], i: usize) -> Result<u8, NormalisationError> {
let (Some(&h), Some(&l)) = (bytes.get(i + 1), bytes.get(i + 2)) else {
return Err(NormalisationError::TruncatedPercent);
};
let (Some(high), Some(low)) = ((h as char).to_digit(16), (l as char).to_digit(16)) else {
return Err(NormalisationError::NonHex);
};
Ok(u8::try_from(high * 16 + low).expect("hex digits fit in u8"))
}
/// Normalises the percent-encoding used in `input`.
///
/// Each `%XX` escape, or run of consecutive escapes that together encode one
/// UTF-8 character, is decoded. The decoded character is then written out
/// literally, unless it is an RFC 3986 reserved character (`gen-delims` /
/// `sub-delims`) or `%` itself; those stay escaped, with upper-case hex digits.
/// Keeping `%` escaped ensures the output never gains an escape sequence that
/// was not present in the input (e.g. `%2541` must not become `%41`).
///
/// Bytes outside of escapes are copied through unchanged. When `input`
/// contains no `%` at all it is returned as [`Cow::Borrowed`].
///
/// # Errors
///
/// - [`NormalisationError::TruncatedPercent`] if the input ends less than two
///   bytes after a `%`.
/// - [`NormalisationError::NonHex`] if the two bytes after a `%` are not both
///   hexadecimal digits.
/// - [`NormalisationError::InvalidPercent`] if the decoded bytes are not one
///   valid UTF-8 character: the first byte is not a valid leading byte, a
///   required continuation escape is missing, or the sequence fails UTF-8
///   validation. The payload is the byte offset just past the last escape
///   that was consumed.
// Every `expect` below guards an invariant established inside this function,
// so no panic is reachable from any `input`; a `# Panics` section would mislead.
#[allow(clippy::missing_panics_doc)]
pub fn normalise_percent_encoded(input: &str) -> Result<Cow<'_, str>, NormalisationError> {
    let bytes = input.as_bytes();
    let mut result = String::new();
    // Offset of the first input byte that has not been copied into `result` yet.
    let mut last_pos = 0;
    let mut i = 0;
    while let Some(&current_byte) = bytes.get(i) {
        if current_byte != b'%' {
            i += 1;
            continue;
        }

        let start = i;
        let first_byte = decode_hex_pair(bytes, i)?;
        i += 3;

        // The leading byte determines how many escapes make up this character.
        let char_len = match first_byte {
            0x00..=0x7F => 1,
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF7 => 4,
            _ => return Err(NormalisationError::InvalidPercent(i)),
        };

        let mut char_bytes = [first_byte, 0, 0, 0];
        for dest in char_bytes.iter_mut().skip(1).take(char_len - 1) {
            // Continuation bytes must themselves be percent-escaped.
            if bytes.get(i) != Some(&b'%') {
                return Err(NormalisationError::InvalidPercent(i));
            }
            *dest = decode_hex_pair(bytes, i)?;
            i += 3;
        }

        // `from_utf8` rejects overlong forms, surrogates and out-of-range values.
        let ch = std::str::from_utf8(char_bytes.get(..char_len).expect("char_len is 1-4"))
            .map_err(|_| NormalisationError::InvalidPercent(i))?
            .chars()
            .next()
            .expect("decoded string has exactly one char");

        // Flush the literal text between the previous escape and this one.
        result.push_str(
            input
                .get(last_pos..start)
                .expect("last_pos and start are valid indices"),
        );

        match ch {
            '%' | ':' | '/' | '?' | '#' | '[' | ']' | '@' | '!' | '$' | '&' | '\'' | '(' | ')'
            | '*' | '+' | ',' | ';' | '=' => {
                // All of these are ASCII, so `first_byte` is the whole character.
                result.push('%');
                write!(result, "{first_byte:02X}").expect("write to String");
            }
            _ => result.push(ch),
        }
        last_pos = i;
    }

    // `last_pos` only advances after a successfully processed escape, so zero
    // means the input contained no `%` and can be returned untouched.
    if last_pos == 0 {
        Ok(Cow::Borrowed(input))
    } else {
        result.push_str(input.get(last_pos..).expect("last_pos is valid index"));
        Ok(Cow::Owned(result))
    }
}
/// Percent-encodes every character of `input` that is neither an RFC 3986
/// reserved character, an unreserved character, nor a `%`.
///
/// Characters that need encoding are written as one upper-case `%XX` escape per
/// UTF-8 byte. A `%` already present in `input` is passed through untouched, so
/// existing escapes are preserved as they are.
///
/// Returns [`Cow::Borrowed`] when nothing had to be encoded.
#[must_use]
pub fn strict_percent_encoded(input: &str) -> Cow<'_, str> {
    let mut encoded = String::new();
    // Offset of the first input byte not yet copied into `encoded`.
    let mut copied_up_to = 0;

    for (offset, c) in input.char_indices() {
        let passes_through = c.is_ascii_alphanumeric()
            || matches!(
                c,
                ':' | '/' | '?' | '#' | '[' | ']' | '@'
                    | '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
                    | '-' | '.' | '_' | '~'
                    | '%'
            );
        if passes_through {
            continue;
        }

        // Flush the untouched run preceding this character, then escape it.
        encoded.push_str(&input[copied_up_to..offset]);
        let mut utf8 = [0_u8; 4];
        for byte in c.encode_utf8(&mut utf8).bytes() {
            encoded.push('%');
            write!(encoded, "{byte:02X}").expect("Appending to string must succeed");
        }
        copied_up_to = offset + c.len_utf8();
    }

    // An empty buffer means no character was escaped.
    if encoded.is_empty() {
        return Cow::Borrowed(input);
    }
    encoded.push_str(&input[copied_up_to..]);
    Cow::Owned(encoded)
}
#[cfg(test)]
mod test {
    use super::{NormalisationError, normalise_percent_encoded, strict_percent_encoded};

    /// Normalises `input`, panicking if normalisation reports an error.
    fn normalised(input: &str) -> String {
        normalise_percent_encoded(input).unwrap().into_owned()
    }

    // --- normalise_percent_encoded -------------------------------------------

    #[test]
    fn normalise_percent_encoded_valid_percent_encoding() {
        assert_eq!(normalised("%41"), "A");
    }

    #[test]
    fn normalise_percent_encoded_invalid_hex_characters() {
        assert_eq!(
            normalise_percent_encoded("%4G"),
            Err(NormalisationError::NonHex)
        );
    }

    #[test]
    fn normalise_percent_encoded_incomplete_percent_encoding() {
        assert_eq!(
            normalise_percent_encoded("%4"),
            Err(NormalisationError::TruncatedPercent)
        );
    }

    #[test]
    fn normalise_percent_encoded_trailing_percent() {
        assert_eq!(
            normalise_percent_encoded("hello%"),
            Err(NormalisationError::TruncatedPercent)
        );
    }

    /// A literal reserved character is left exactly as written.
    #[test]
    fn normalise_percent_encoded_unencoded_reserved_character() {
        assert_eq!(normalised("hello/"), "hello/");
    }

    /// An escaped reserved character stays escaped, with upper-case hex digits.
    #[test]
    fn normalise_percent_encoded_reserved_character() {
        assert_eq!(normalised("hello%2f"), "hello%2F");
    }

    #[test]
    fn normalise_percent_encoded_keep_at_sign() {
        assert_eq!(normalised("%40"), "%40");
    }

    #[test]
    fn normalise_percent_encoded_empty_string() {
        assert_eq!(normalised(""), "");
    }

    #[test]
    fn normalise_percent_encoded_multibyte_utf8() {
        assert_eq!(normalised("%CE%95%CF%80%CE%B1%CF%86%CE%AD%CF%82"), "Επαφές");
    }

    #[test]
    fn normalise_percent_encoded_mixed_ascii_and_utf8() {
        assert_eq!(
            normalised("hello-%CE%95%CF%80%CE%B1%CF%86%CE%AD%CF%82"),
            "hello-Επαφές"
        );
    }

    #[test]
    fn normalise_percent_encoded_rejects_overlong_encoding() {
        assert!(normalise_percent_encoded("%C1%81").is_err());
    }

    // --- strict_percent_encoded ----------------------------------------------

    #[test]
    fn strict_percent_encoded_reserved_characters() {
        assert_eq!(
            strict_percent_encoded(":/?#[]@!$&'()*+,;="),
            ":/?#[]@!$&'()*+,;="
        );
    }

    #[test]
    fn strict_percent_encoded_unreserved_characters() {
        assert_eq!(
            strict_percent_encoded(
                "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~"
            ),
            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~"
        );
    }

    /// Existing escapes pass through unchanged.
    #[test]
    fn strict_percent_encoded_percent_encoded_characters() {
        assert_eq!(strict_percent_encoded("%20%2F%3F"), "%20%2F%3F");
    }

    #[test]
    fn strict_percent_encoded_multibyte_characters() {
        assert_eq!(
            strict_percent_encoded("こんにちは"),
            "%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF"
        );
    }

    #[test]
    fn strict_percent_encoded_german_special_characters() {
        assert_eq!(
            strict_percent_encoded("Grüße aus Köln!"),
            "Gr%C3%BC%C3%9Fe%20aus%20K%C3%B6ln!"
        );
    }

    #[test]
    fn strict_percent_encoded_emoji() {
        assert_eq!(
            strict_percent_encoded("😀🔥"),
            "%F0%9F%98%80%F0%9F%94%A5"
        );
    }

    #[test]
    fn strict_percent_encoded_mixed_characters() {
        assert_eq!(
            strict_percent_encoded("Hello:/World%20😀"),
            "Hello:/World%20%F0%9F%98%80"
        );
    }

    #[test]
    fn strict_percent_encoded_tilde() {
        assert_eq!(strict_percent_encoded("~"), "~");
    }
}