/// Reports whether `byte` is an ASCII letter, an ASCII digit, or one of
/// `-`, `.`, `_`, `~` — the set this file treats as "unreserved" and therefore
/// safe to percent-decode.
fn is_unreserved(byte: u8) -> bool {
    matches!(
        byte,
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~'
    )
}
/// Converts a single ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its
/// numeric value `0..=15`.
///
/// Returns `None` for every other byte, including non-ASCII bytes.
fn hex_val(b: u8) -> Option<u8> {
    // `to_digit(16)` yields at most 15, so narrowing to `u8` cannot truncate.
    char::from(b).to_digit(16).map(|digit| digit as u8)
}
/// Performs a single left-to-right percent-decoding pass over `input`.
///
/// For every `%XY` triplet whose `X` and `Y` are hex digits:
/// * if the encoded byte is unreserved (see `is_unreserved`), the triplet is
///   replaced by that byte;
/// * otherwise the triplet is kept, with its hex digits upper-cased.
///
/// A `%` that is not followed by two hex digits is copied through unchanged.
///
/// Returns the rewritten string together with a flag that is `true` only when
/// at least one triplet was actually decoded (case-only changes do not count).
fn decode_unreserved_once(input: &str) -> (String, bool) {
    let source = input.as_bytes();
    let mut out: Vec<u8> = Vec::with_capacity(source.len());
    let mut changed = false;
    let mut rest = source;

    while let Some((&head, tail)) = rest.split_first() {
        if head == b'%' {
            // A triplet needs at least two more bytes after the '%'.
            if let [hi_digit, lo_digit, after @ ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_val(*hi_digit), hex_val(*lo_digit)) {
                    let value = (hi << 4) | lo;
                    if is_unreserved(value) {
                        out.push(value);
                        changed = true;
                    } else {
                        // Keep the escape, but canonicalise the hex digit case.
                        out.extend_from_slice(&[
                            b'%',
                            hi_digit.to_ascii_uppercase(),
                            lo_digit.to_ascii_uppercase(),
                        ]);
                    }
                    rest = after;
                    continue;
                }
            }
        }
        // Ordinary byte, or a '%' that does not start a valid triplet.
        out.push(head);
        rest = tail;
    }

    (String::from_utf8_lossy(&out).into_owned(), changed)
}
/// Normalizes the percent-encoding of a raw path component.
///
/// `decode_unreserved_once` is applied repeatedly: each pass decodes escapes of
/// unreserved characters and upper-cases the hex digits of the escapes it
/// keeps. Iteration stops as soon as a pass decodes nothing, or after
/// `MAX_ROUNDS` passes, whichever comes first. The final text is then scanned
/// by `detect_double_encoding`.
///
/// The returned [`NormalizedComponent`] carries the untouched input (`raw`),
/// the normalized text, the double-encoding flag and the number of passes
/// that were executed (always at least 1).
pub fn normalize_path(raw: &str) -> NormalizedComponent {
    // Upper bound on decode passes so crafted input cannot force unbounded work.
    const MAX_ROUNDS: u32 = 3;

    let mut current = raw.to_string();
    let mut rounds = 0;
    loop {
        let (decoded, did_decode) = decode_unreserved_once(&current);
        current = decoded;
        rounds += 1;
        if !did_decode || rounds >= MAX_ROUNDS {
            break;
        }
    }

    let double_encoded = detect_double_encoding(&current);
    NormalizedComponent {
        raw: raw.to_string(),
        normalized: current,
        double_encoded,
        rounds,
    }
}
/// Normalizes the percent-encoding of a raw query component.
///
/// This is a straight delegation to [`normalize_path`]: the query string gets
/// exactly the same treatment as a path, with no query-specific rules.
pub fn normalize_query(raw: &str) -> NormalizedComponent {
normalize_path(raw)
}
/// Reports whether `s` contains the pattern `%25` immediately followed by two
/// hexadecimal digits (e.g. `%252F`), i.e. an encoded `%` that itself starts
/// what looks like another percent-escape.
///
/// Strings shorter than five bytes can never match and yield `false`.
fn detect_double_encoding(s: &str) -> bool {
    s.as_bytes().windows(5).any(|window| match window {
        [b'%', b'2', b'5', hi, lo] => hex_val(*hi).is_some() && hex_val(*lo).is_some(),
        _ => false,
    })
}
/// Result of normalizing one URI component, as produced by `normalize_path`
/// (and `normalize_query`, which delegates to it).
#[derive(Debug, Clone)]
pub struct NormalizedComponent {
/// The input exactly as it was passed in.
pub raw: String,
/// The text after the decode passes have finished.
pub normalized: String,
/// `true` when `normalized` still contains `%25` followed by two hex digits.
pub double_encoded: bool,
/// Number of decode passes that were executed (at least 1).
pub rounds: u32,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Normalized text that `normalize_path` produces for `input`.
    fn normalized_of(input: &str) -> String {
        normalize_path(input).normalized
    }

    /// Double-encoding flag that `normalize_path` reports for `input`.
    fn flagged_double(input: &str) -> bool {
        normalize_path(input).double_encoded
    }

    // --- escapes of unreserved characters are decoded ---------------------

    #[test]
    fn test_unreserved_decoded() {
        assert_eq!(normalized_of("%41"), "A");
    }

    // --- escapes of reserved characters are kept --------------------------

    #[test]
    fn test_reserved_preserved() {
        assert_eq!(normalized_of("%2F"), "%2F");
    }

    #[test]
    fn test_reserved_at_preserved() {
        assert_eq!(normalized_of("%40"), "%40");
    }

    #[test]
    fn test_reserved_colon_preserved() {
        assert_eq!(normalized_of("%3A"), "%3A");
    }

    #[test]
    fn test_reserved_question_preserved() {
        assert_eq!(normalized_of("%3F"), "%3F");
    }

    #[test]
    fn test_hex_case_normalized() {
        assert_eq!(normalized_of("%2f"), "%2F");
    }

    // --- double-encoding detection ----------------------------------------

    #[test]
    fn test_double_encoding_detected() {
        assert!(flagged_double("%252F"));
    }

    #[test]
    fn test_single_level_not_double_encoded() {
        assert!(!flagged_double("%2F"));
    }

    #[test]
    fn test_mixed_encoding() {
        assert_eq!(normalized_of("%41%2F"), "A%2F");
    }

    // --- the four unreserved punctuation marks ----------------------------

    #[test]
    fn test_tilde_decoded() {
        assert_eq!(normalized_of("%7E"), "~");
    }

    #[test]
    fn test_hyphen_decoded() {
        assert_eq!(normalized_of("%2D"), "-");
    }

    #[test]
    fn test_dot_decoded() {
        assert_eq!(normalized_of("%2E"), ".");
    }

    #[test]
    fn test_underscore_decoded() {
        assert_eq!(normalized_of("%5F"), "_");
    }

    // --- pass-through and malformed input ---------------------------------

    #[test]
    fn test_no_encoding() {
        let NormalizedComponent {
            normalized, rounds, ..
        } = normalize_path("/path/to/file");
        assert_eq!(normalized, "/path/to/file");
        assert_eq!(rounds, 1);
    }

    #[test]
    fn test_invalid_percent_triplet() {
        assert_eq!(normalized_of("%GG"), "%GG");
    }

    #[test]
    fn test_multiple_rounds() {
        assert!(flagged_double("%2541"));
    }
}