Skip to main content

tirith_core/
normalize.rs

// Per-component normalization: decode only unreserved characters (RFC 3986 §2.3).
// Unreserved: A-Z, a-z, 0-9, '-', '.', '_', '~'
3
/// Report whether `byte` is an RFC 3986 "unreserved" character.
///
/// The unreserved set is the ASCII letters and digits plus `-`, `.`, `_`
/// and `~`. Every other byte value (including all non-ASCII bytes) yields
/// `false`.
fn is_unreserved(byte: u8) -> bool {
    matches!(
        byte,
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~'
    )
}
8
/// Convert one ASCII hexadecimal digit into its numeric value (0..=15).
///
/// Both lowercase (`a`-`f`) and uppercase (`A`-`F`) letters are accepted.
/// Returns `None` for any byte that is not a hex digit.
fn hex_val(b: u8) -> Option<u8> {
    // For each accepted range, pick the byte that maps to zero so the digit
    // value is a single subtraction.
    let zero_point = match b {
        b'0'..=b'9' => b'0',
        b'a'..=b'f' => b'a' - 10,
        b'A'..=b'F' => b'A' - 10,
        _ => return None,
    };
    Some(b - zero_point)
}
18
19/// Decode only unreserved percent-encoded characters in a string.
20/// Returns the normalized string and whether any unreserved chars were decoded.
21/// Hex digits in percent-triplets are always normalized to uppercase.
22fn decode_unreserved_once(input: &str) -> (String, bool) {
23    let bytes = input.as_bytes();
24    let mut result = Vec::with_capacity(bytes.len());
25    let mut decoded_any = false;
26    let mut i = 0;
27
28    while i < bytes.len() {
29        if bytes[i] == b'%' && i + 2 < bytes.len() {
30            if let (Some(hi), Some(lo)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2])) {
31                let decoded_byte = (hi << 4) | lo;
32                if is_unreserved(decoded_byte) {
33                    result.push(decoded_byte);
34                    decoded_any = true;
35                    i += 3;
36                    continue;
37                } else {
38                    // Normalize hex to uppercase but keep encoded
39                    result.push(b'%');
40                    result.push(bytes[i + 1].to_ascii_uppercase());
41                    result.push(bytes[i + 2].to_ascii_uppercase());
42                    i += 3;
43                    continue;
44                }
45            }
46            // Invalid percent-triplet, leave as-is
47            result.push(bytes[i]);
48            i += 1;
49        } else {
50            result.push(bytes[i]);
51            i += 1;
52        }
53    }
54
55    (String::from_utf8_lossy(&result).into_owned(), decoded_any)
56}
57
58/// Normalize a URL path component (decode unreserved chars, up to 3 rounds).
59/// Returns (normalized, raw, detected_double_encoding).
60pub fn normalize_path(raw: &str) -> NormalizedComponent {
61    let mut current = raw.to_string();
62    let mut rounds = 0;
63
64    // Always run at least one pass (for hex case normalization),
65    // then continue if unreserved chars were decoded (up to 3 rounds).
66    loop {
67        let (decoded, did_decode) = decode_unreserved_once(&current);
68        current = decoded;
69        rounds += 1;
70        if !did_decode || rounds >= 3 {
71            break;
72        }
73    }
74
75    // Detect double-encoding: look for %25XX patterns in the final result
76    // This indicates a percent-encoded percent sign that decoded to %XX
77    let double_encoded = detect_double_encoding(&current);
78
79    NormalizedComponent {
80        raw: raw.to_string(),
81        normalized: current,
82        double_encoded,
83        rounds,
84    }
85}
86
87/// Normalize a query/fragment component (same treatment as path).
88pub fn normalize_query(raw: &str) -> NormalizedComponent {
89    normalize_path(raw)
90}
91
/// Report whether `s` contains a double-encoded percent-triplet.
///
/// A match is the literal bytes `%25` (a percent-encoded percent sign)
/// immediately followed by two hexadecimal digits of either case. Strings
/// shorter than five bytes can never match.
fn detect_double_encoding(s: &str) -> bool {
    s.as_bytes().windows(5).any(|window| {
        matches!(
            window,
            [b'%', b'2', b'5', hi, lo] if hi.is_ascii_hexdigit() && lo.is_ascii_hexdigit()
        )
    })
}
112
/// Result of normalizing a single URL component.
///
/// Produced by `normalize_path` and `normalize_query`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedComponent {
    /// The component exactly as it was supplied by the caller.
    pub raw: String,
    /// The component after normalization: unreserved characters decoded and
    /// the hex digits of retained percent-triplets uppercased.
    pub normalized: String,
    /// `true` when `normalized` still contains `%25` followed by two hex
    /// digits, i.e. a percent-encoded percent sign in front of a would-be
    /// triplet.
    pub double_encoded: bool,
    /// Number of normalization passes that were run (at least one).
    pub rounds: u32,
}
121
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Unreserved characters are decoded ----

    #[test]
    fn test_unreserved_decoded() {
        // %41 = 'A' (unreserved) -> decoded
        let result = normalize_path("%41");
        assert_eq!(result.normalized, "A");
    }

    #[test]
    fn test_tilde_decoded() {
        // %7E = '~' (unreserved) -> decoded
        let result = normalize_path("%7E");
        assert_eq!(result.normalized, "~");
    }

    #[test]
    fn test_hyphen_decoded() {
        // %2D = '-' (unreserved) -> decoded
        let result = normalize_path("%2D");
        assert_eq!(result.normalized, "-");
    }

    #[test]
    fn test_dot_decoded() {
        // %2E = '.' (unreserved) -> decoded
        let result = normalize_path("%2E");
        assert_eq!(result.normalized, ".");
    }

    #[test]
    fn test_underscore_decoded() {
        // %5F = '_' (unreserved) -> decoded
        let result = normalize_path("%5F");
        assert_eq!(result.normalized, "_");
    }

    // ---- Reserved characters stay encoded ----

    #[test]
    fn test_reserved_preserved() {
        // %2F = '/' (reserved) -> stays encoded
        let result = normalize_path("%2F");
        assert_eq!(result.normalized, "%2F");
    }

    #[test]
    fn test_reserved_at_preserved() {
        // %40 = '@' (reserved) -> stays encoded
        let result = normalize_path("%40");
        assert_eq!(result.normalized, "%40");
    }

    #[test]
    fn test_reserved_colon_preserved() {
        // %3A = ':' (reserved) -> stays encoded
        let result = normalize_path("%3A");
        assert_eq!(result.normalized, "%3A");
    }

    #[test]
    fn test_reserved_question_preserved() {
        // %3F = '?' (reserved) -> stays encoded
        let result = normalize_path("%3F");
        assert_eq!(result.normalized, "%3F");
    }

    #[test]
    fn test_hex_case_normalized() {
        // %2f (lowercase) -> %2F (uppercase, still encoded because '/' is reserved)
        let result = normalize_path("%2f");
        assert_eq!(result.normalized, "%2F");
    }

    #[test]
    fn test_mixed_encoding() {
        // %41%2F -> A%2F ('A' decoded, '/' kept encoded)
        let result = normalize_path("%41%2F");
        assert_eq!(result.normalized, "A%2F");
    }

    // ---- Double-encoding detection ----

    #[test]
    fn test_double_encoding_detected() {
        // %25 encodes '%', which is not unreserved, so it is never decoded and
        // the string is left unchanged. The surviving "%25" followed by the
        // hex digits "2F" is what gets flagged as double encoding.
        let result = normalize_path("%252F");
        assert_eq!(result.normalized, "%252F");
        assert!(result.double_encoded);
    }

    #[test]
    fn test_encoded_percent_is_not_unwrapped() {
        // %2541 parses as the triplet %25 followed by the literal "41".
        // %25 stays encoded, so this must not collapse to "%41" or "A";
        // it is reported as double-encoded instead.
        let result = normalize_path("%2541");
        assert_eq!(result.normalized, "%2541");
        assert!(result.double_encoded);
    }

    #[test]
    fn test_single_level_not_double_encoded() {
        // %2F is ordinary single-level encoding
        let result = normalize_path("%2F");
        assert!(!result.double_encoded);
    }

    // ---- Pass counting ----

    #[test]
    fn test_no_encoding() {
        let result = normalize_path("/path/to/file");
        assert_eq!(result.normalized, "/path/to/file");
        // One pass always runs (for hex case normalization), even with no encodings
        assert_eq!(result.rounds, 1);
    }

    #[test]
    fn test_multiple_rounds() {
        // Round 1: the leading "%%" is not a valid triplet, so the first '%'
        //          is copied; %34 -> '4' and %31 -> '1', leaving "%41".
        // Round 2: %41 -> 'A'.
        // Round 3: nothing left to decode, so the loop stops.
        let result = normalize_path("%%34%31");
        assert_eq!(result.normalized, "A");
        assert_eq!(result.rounds, 3);
        assert!(!result.double_encoded);
    }

    // ---- Malformed and non-ASCII input ----

    #[test]
    fn test_invalid_percent_triplet() {
        // %GG is not valid hex -> left as-is
        let result = normalize_path("%GG");
        assert_eq!(result.normalized, "%GG");
    }

    #[test]
    fn test_truncated_percent_triplet() {
        // A '%' without two following bytes is copied through untouched
        assert_eq!(normalize_path("%4").normalized, "%4");
        assert_eq!(normalize_path("abc%").normalized, "abc%");
    }

    #[test]
    fn test_non_ascii_preserved() {
        // Multi-byte UTF-8 passes through unchanged next to a decoded triplet
        let result = normalize_path("caf\u{e9}%41");
        assert_eq!(result.normalized, "caf\u{e9}A");
    }

    // ---- Query normalization ----

    #[test]
    fn test_query_matches_path() {
        // Queries get the same treatment as paths
        let result = normalize_query("%7e%2f");
        assert_eq!(result.raw, "%7e%2f");
        assert_eq!(result.normalized, "~%2F");
        assert!(!result.double_encoded);
    }
}