tiny_clean/
uri_encoder.rs1use crate::common::{HEX_MASK, HEX_SHIFT, U_HEX, char_bucket, char_mask};
2
/// Largest code point that still fits in a two-byte UTF-8 sequence (11 payload bits).
const MAX_UTF8_2_BYTE: u32 = 0x07FF;
/// Marker bits (`10xx_xxxx`) OR-ed into every UTF-8 continuation byte.
const UTF8_BYTE_MSB: u32 = 0x80;
/// Marker bits (`110x_xxxx`) OR-ed into the first byte of a two-byte sequence.
const UTF8_2_BYTE_FIRST_MSB: u32 = 0xC0;
/// Marker bits (`1110_xxxx`) OR-ed into the first byte of a three-byte sequence.
const UTF8_3_BYTE_FIRST_MSB: u32 = 0xE0;
/// Marker bits (`1111_0xxx`) OR-ed into the first byte of a four-byte sequence.
const UTF8_4_BYTE_FIRST_MSB: u32 = 0xF0;

/// Number of code-point bits carried by each continuation byte.
const UTF8_SHIFT: u32 = 6;
/// Mask selecting the `UTF8_SHIFT` payload bits of one continuation byte.
const UTF8_MASK: u32 = 0x3F;
18
/// Selects which ASCII characters a [`UriEncoder`] copies through without escaping.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UriEncoderMode {
    /// Only ASCII letters, digits and `-`, `.`, `_`, `~` are left unescaped.
    Component,
    /// Everything `Component` leaves unescaped, plus
    /// `! # $ ? & ( ) * + , : ; = / ' [ ] @`.
    FullUri,
}
23
/// Percent-encoder whose set of pass-through ASCII characters is fixed at construction.
#[derive(Debug, Clone)]
pub struct UriEncoder {
    /// One 32-bit mask per bucket of ASCII characters; a set bit marks a character that is
    /// copied to the output unescaped. Indexed by `char_bucket` and tested against `char_mask`.
    valid_masks: [u32; 4],
}
27impl UriEncoder {
28 pub fn new(mode: UriEncoderMode) -> Self {
29 let one_to_nine = ((1u32 << 10u32) - 1u32) << ('0' as u32 & 31u32);
31
32 let uppercase_a_z = ((1u32 << 26u32) - 1u32) << ('A' as u32 & 31u32);
34
35 let lowercase_a_z = ((1u32 << 26u32) - 1u32) << ('a' as u32 & 31u32);
37
38 let uri_unreserved_bucket1 = one_to_nine | char_mask('-') | char_mask('.');
39 let uri_unreserved_bucket2 = uppercase_a_z | char_mask('_');
40 let uri_unreserved_bucket3 = lowercase_a_z | char_mask('~');
41
42 match mode {
43 UriEncoderMode::Component => {
44 let valid_masks = [
45 0,
46 uri_unreserved_bucket1,
47 uri_unreserved_bucket2,
48 uri_unreserved_bucket3,
49 ];
50 Self { valid_masks }
51 }
52 UriEncoderMode::FullUri => {
53 let reserved_chars1 = [
54 '!', '#', '$', '?', '&', '(', ')', '*', '+', ',', ':', ';', '=', '/', '\'',
55 ];
56 let mut uri_reserved_bucket1: u32 = 0;
57 for reserved in reserved_chars1 {
58 uri_reserved_bucket1 |= char_mask(reserved);
59 }
60
61 let reserved_chars2 = ['[', ']', '@'];
62 let mut uri_reserved_bucket2: u32 = 0;
63 for reserved in reserved_chars2 {
64 uri_reserved_bucket2 |= char_mask(reserved);
65 }
66
67 let valid_masks = [
68 0,
69 uri_unreserved_bucket1 | uri_reserved_bucket1,
70 uri_unreserved_bucket2 | uri_reserved_bucket2,
71 uri_unreserved_bucket3,
72 ];
73 Self { valid_masks }
74 }
75 }
76 }
77
78 pub fn encode(&self, input: &str) -> String {
79 let starting_capacity = (u32::MAX / 2u32).min((input.len() * 9usize) as u32) as usize;
80 let mut result = String::with_capacity(starting_capacity);
81 for c in input.chars() {
82 if c as u32 <= 127u32 {
83 let bucket = char_bucket(c);
84 let mask = char_mask(c);
85
86 if (self.valid_masks[bucket] & mask) != 0 {
87 result.push(c);
88 continue;
89 } else {
90 result.push('%');
91 result.push(U_HEX[(c as u32 >> HEX_SHIFT) as usize]);
92 result.push(U_HEX[(c as u32 & HEX_MASK) as usize]);
93 continue;
94 }
95 } else if c as u32 <= MAX_UTF8_2_BYTE {
96 let b1 = UTF8_2_BYTE_FIRST_MSB | (c as u32 >> UTF8_SHIFT);
97 result.push('%');
98 result.push(U_HEX[(b1 >> HEX_SHIFT) as usize]);
99 result.push(U_HEX[(b1 & HEX_MASK) as usize]);
100
101 let b2 = UTF8_BYTE_MSB | (c as u32 & UTF8_MASK);
102 result.push('%');
103 result.push(U_HEX[(b2 >> HEX_SHIFT) as usize]);
104 result.push(U_HEX[(b2 & HEX_MASK) as usize]);
105 } else if c as u32 <= 0xFFFF {
106 let b1 = UTF8_3_BYTE_FIRST_MSB | (c as u32 >> (2 * UTF8_SHIFT));
107 result.push('%');
108 result.push(U_HEX[(b1 >> HEX_SHIFT) as usize]);
109 result.push(U_HEX[(b1 & HEX_MASK) as usize]);
110
111 let b2 = UTF8_BYTE_MSB | ((c as u32 >> UTF8_SHIFT) & UTF8_MASK);
112 result.push('%');
113 result.push(U_HEX[(b2 >> HEX_SHIFT) as usize]);
114 result.push(U_HEX[(b2 & HEX_MASK) as usize]);
115
116 let b3 = UTF8_BYTE_MSB | (c as u32 & UTF8_MASK);
117 result.push('%');
118 result.push(U_HEX[(b3 >> HEX_SHIFT) as usize]);
119 result.push(U_HEX[(b3 & HEX_MASK) as usize]);
120 } else {
121 let b1 = UTF8_4_BYTE_FIRST_MSB | (c as u32 >> (3 * UTF8_SHIFT));
122 result.push('%');
123 result.push(U_HEX[(b1 >> HEX_SHIFT) as usize]);
124 result.push(U_HEX[(b1 & HEX_MASK) as usize]);
125
126 let b2 = UTF8_BYTE_MSB | ((c as u32 >> (2 * UTF8_SHIFT)) & UTF8_MASK);
127 result.push('%');
128 result.push(U_HEX[(b2 >> HEX_SHIFT) as usize]);
129 result.push(U_HEX[(b2 & HEX_MASK) as usize]);
130
131 let b3 = UTF8_BYTE_MSB | ((c as u32 >> UTF8_SHIFT) & UTF8_MASK);
132 result.push('%');
133 result.push(U_HEX[(b3 >> HEX_SHIFT) as usize]);
134 result.push(U_HEX[(b3 & HEX_MASK) as usize]);
135
136 let b4 = UTF8_BYTE_MSB | (c as u32 & UTF8_MASK);
137 result.push('%');
138 result.push(U_HEX[(b4 >> HEX_SHIFT) as usize]);
139 result.push(U_HEX[(b4 & HEX_MASK) as usize]);
140 }
141 }
142 result.shrink_to_fit();
143 result
144 }
145}
146
#[cfg(test)]
mod test {
    use crate::uri_encoder::{UriEncoder, UriEncoderMode};

    /// Asserts that every `(input, expected)` pair encodes as stated.
    fn assert_encodes(encoder: &UriEncoder, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(expected, encoder.encode(input), "input: {:?}", input);
        }
    }

    /// Expectations that hold in every encoder mode.
    fn shared_test_cases(encoder: &UriEncoder) {
        assert_encodes(
            encoder,
            &[
                // Empty input and unreserved characters pass through untouched.
                ("", ""),
                ("abcABC123", "abcABC123"),
                ("-._~", "-._~"),
                // ASCII that is neither unreserved nor reserved is always escaped.
                ("\u{0}", "%00"),
                ("\n", "%0A"),
                (" ", "%20"),
                ("\"", "%22"),
                ("%", "%25"),
                ("<", "%3C"),
                (">", "%3E"),
                ("\\", "%5C"),
                ("^", "%5E"),
                ("`", "%60"),
                ("{", "%7B"),
                ("|", "%7C"),
                ("}", "%7D"),
                ("\u{7f}", "%7F"),
                ("a b", "a%20b"),
                // Two-byte UTF-8 sequences, including the upper boundary.
                ("\u{00a0}", "%C2%A0"),
                ("\u{07ff}", "%DF%BF"),
                // Three-byte UTF-8 sequences, lower and upper boundary.
                ("\u{0800}", "%E0%A0%80"),
                ("\u{ffff}", "%EF%BF%BF"),
                // Four-byte UTF-8 sequences, lower boundary, a typical emoji, upper boundary.
                ("\u{10000}", "%F0%90%80%80"),
                ("\u{1f600}", "%F0%9F%98%80"),
                ("\u{10ffff}", "%F4%8F%BF%BF"),
            ],
        );
    }

    #[test]
    fn test_component_encode() {
        let encoder = UriEncoder::new(UriEncoderMode::Component);
        // Component mode escapes every reserved character.
        assert_encodes(
            &encoder,
            &[
                (":", "%3A"),
                ("/", "%2F"),
                ("?", "%3F"),
                ("#", "%23"),
                ("[", "%5B"),
                ("]", "%5D"),
                ("@", "%40"),
                ("!", "%21"),
                ("$", "%24"),
                ("&", "%26"),
                ("'", "%27"),
                ("(", "%28"),
                (")", "%29"),
                ("*", "%2A"),
                ("+", "%2B"),
                (",", "%2C"),
                (";", "%3B"),
                ("=", "%3D"),
            ],
        );
        shared_test_cases(&encoder);
    }

    #[test]
    fn test_full_uri_encode() {
        let encoder = UriEncoder::new(UriEncoderMode::FullUri);
        assert_eq!(
            "http://www.owasp.org/index.php?foo=bar&baz#fragment",
            encoder.encode("http://www.owasp.org/index.php?foo=bar&baz#fragment")
        );
        // Full-URI mode leaves every reserved character as it is.
        assert_encodes(&encoder, &[("!#$&'()*+,/:;=?@[]", "!#$&'()*+,/:;=?@[]")]);
        shared_test_cases(&encoder);
    }
}