tiny_clean/
uri_encoder.rs

1use crate::common::{HEX_MASK, HEX_SHIFT, U_HEX, char_bucket, char_mask};
2
/// Largest code point that still fits in a two-byte UTF-8 sequence
/// (`0111_1111_1111`, i.e. eleven payload bits).
const MAX_UTF8_2_BYTE: u32 = 0x07FF;
/// High-bit marker (`1000_0000`) OR-ed into every trailing byte of a multi-byte sequence.
const UTF8_BYTE_MSB: u32 = 0x80;
/// Marker bits (`1100_0000`) OR-ed into the first byte of a two-byte sequence.
const UTF8_2_BYTE_FIRST_MSB: u32 = 0xC0;
/// Marker bits (`1110_0000`) OR-ed into the first byte of a three-byte sequence.
const UTF8_3_BYTE_FIRST_MSB: u32 = 0xE0;
/// Marker bits (`1111_0000`) OR-ed into the first byte of a four-byte sequence.
const UTF8_4_BYTE_FIRST_MSB: u32 = 0xF0;

/// Number of code point bits carried by each trailing byte; used as the per-byte shift.
const UTF8_SHIFT: u32 = 6;
/// Mask (`0011_1111`) selecting the low six bits destined for one trailing byte.
const UTF8_MASK: u32 = 0x3F;
18
/// Selects which ASCII characters a `UriEncoder` copies through unescaped.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UriEncoderMode {
    /// Encode text destined for a single URI component: only the unreserved
    /// characters (`A-Z`, `a-z`, `0-9`, `-`, `.`, `_`, `~`) are left as-is.
    Component,
    /// Encode a complete URI: in addition to the unreserved characters, the
    /// reserved delimiters (`! # $ & ' ( ) * + , / : ; = ? @ [ ]`) are left as-is.
    FullUri,
}
23
/// Percent-encoder for URI text.
///
/// The set of ASCII characters that are copied through unescaped is fixed when the
/// encoder is constructed; everything else is emitted as `%XX` escapes.
#[derive(Debug, Clone)]
pub struct UriEncoder {
    /// Bit masks of the ASCII characters that may pass through unescaped.
    /// The table is indexed by `char_bucket(c)` and tested against `char_mask(c)`.
    valid_masks: [u32; 4],
}
27impl UriEncoder {
28    pub fn new(mode: UriEncoderMode) -> Self {
29        //  starting from '0' + 10 bits (aka 0-9)
30        let one_to_nine = ((1u32 << 10u32) - 1u32) << ('0' as u32 & 31u32);
31
32        //  starting from 'A' + 26 bits (aka A-Z)
33        let uppercase_a_z = ((1u32 << 26u32) - 1u32) << ('A' as u32 & 31u32);
34
35        //  starting from 'a' + 26 bits (aka a-z)
36        let lowercase_a_z = ((1u32 << 26u32) - 1u32) << ('a' as u32 & 31u32);
37
38        let uri_unreserved_bucket1 = one_to_nine | char_mask('-') | char_mask('.');
39        let uri_unreserved_bucket2 = uppercase_a_z | char_mask('_');
40        let uri_unreserved_bucket3 = lowercase_a_z | char_mask('~');
41
42        match mode {
43            UriEncoderMode::Component => {
44                let valid_masks = [
45                    0,
46                    uri_unreserved_bucket1,
47                    uri_unreserved_bucket2,
48                    uri_unreserved_bucket3,
49                ];
50                Self { valid_masks }
51            }
52            UriEncoderMode::FullUri => {
53                let reserved_chars1 = [
54                    '!', '#', '$', '?', '&', '(', ')', '*', '+', ',', ':', ';', '=', '/', '\'',
55                ];
56                let mut uri_reserved_bucket1: u32 = 0;
57                for reserved in reserved_chars1 {
58                    uri_reserved_bucket1 |= char_mask(reserved);
59                }
60
61                let reserved_chars2 = ['[', ']', '@'];
62                let mut uri_reserved_bucket2: u32 = 0;
63                for reserved in reserved_chars2 {
64                    uri_reserved_bucket2 |= char_mask(reserved);
65                }
66
67                let valid_masks = [
68                    0,
69                    uri_unreserved_bucket1 | uri_reserved_bucket1,
70                    uri_unreserved_bucket2 | uri_reserved_bucket2,
71                    uri_unreserved_bucket3,
72                ];
73                Self { valid_masks }
74            }
75        }
76    }
77
78    pub fn encode(&self, input: &str) -> String {
79        let starting_capacity = (u32::MAX / 2u32).min((input.len() * 9usize) as u32) as usize;
80        let mut result = String::with_capacity(starting_capacity);
81        for c in input.chars() {
82            if c as u32 <= 127u32 {
83                let bucket = char_bucket(c);
84                let mask = char_mask(c);
85
86                if (self.valid_masks[bucket] & mask) != 0 {
87                    result.push(c);
88                    continue;
89                } else {
90                    result.push('%');
91                    result.push(U_HEX[(c as u32 >> HEX_SHIFT) as usize]);
92                    result.push(U_HEX[(c as u32 & HEX_MASK) as usize]);
93                    continue;
94                }
95            } else if c as u32 <= MAX_UTF8_2_BYTE {
96                let b1 = UTF8_2_BYTE_FIRST_MSB | (c as u32 >> UTF8_SHIFT);
97                result.push('%');
98                result.push(U_HEX[(b1 >> HEX_SHIFT) as usize]);
99                result.push(U_HEX[(b1 & HEX_MASK) as usize]);
100
101                let b2 = UTF8_BYTE_MSB | (c as u32 & UTF8_MASK);
102                result.push('%');
103                result.push(U_HEX[(b2 >> HEX_SHIFT) as usize]);
104                result.push(U_HEX[(b2 & HEX_MASK) as usize]);
105            } else if c as u32 <= 0xFFFF {
106                let b1 = UTF8_3_BYTE_FIRST_MSB | (c as u32 >> (2 * UTF8_SHIFT));
107                result.push('%');
108                result.push(U_HEX[(b1 >> HEX_SHIFT) as usize]);
109                result.push(U_HEX[(b1 & HEX_MASK) as usize]);
110
111                let b2 = UTF8_BYTE_MSB | ((c as u32 >> UTF8_SHIFT) & UTF8_MASK);
112                result.push('%');
113                result.push(U_HEX[(b2 >> HEX_SHIFT) as usize]);
114                result.push(U_HEX[(b2 & HEX_MASK) as usize]);
115
116                let b3 = UTF8_BYTE_MSB | (c as u32 & UTF8_MASK);
117                result.push('%');
118                result.push(U_HEX[(b3 >> HEX_SHIFT) as usize]);
119                result.push(U_HEX[(b3 & HEX_MASK) as usize]);
120            } else {
121                let b1 = UTF8_4_BYTE_FIRST_MSB | (c as u32 >> (3 * UTF8_SHIFT));
122                result.push('%');
123                result.push(U_HEX[(b1 >> HEX_SHIFT) as usize]);
124                result.push(U_HEX[(b1 & HEX_MASK) as usize]);
125
126                let b2 = UTF8_BYTE_MSB | ((c as u32 >> (2 * UTF8_SHIFT)) & UTF8_MASK);
127                result.push('%');
128                result.push(U_HEX[(b2 >> HEX_SHIFT) as usize]);
129                result.push(U_HEX[(b2 & HEX_MASK) as usize]);
130
131                let b3 = UTF8_BYTE_MSB | ((c as u32 >> UTF8_SHIFT) & UTF8_MASK);
132                result.push('%');
133                result.push(U_HEX[(b3 >> HEX_SHIFT) as usize]);
134                result.push(U_HEX[(b3 & HEX_MASK) as usize]);
135
136                let b4 = UTF8_BYTE_MSB | (c as u32 & UTF8_MASK);
137                result.push('%');
138                result.push(U_HEX[(b4 >> HEX_SHIFT) as usize]);
139                result.push(U_HEX[(b4 & HEX_MASK) as usize]);
140            }
141        }
142        result.shrink_to_fit();
143        result
144    }
145}
146
#[cfg(test)]
mod test {
    use crate::uri_encoder::{UriEncoder, UriEncoderMode};

    /// Asserts that `encoder` turns each `input` of the `(input, expected)` pairs into `expected`.
    fn assert_encodes(encoder: &UriEncoder, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(expected, encoder.encode(input), "input: {:?}", input);
        }
    }

    /// Cases whose outcome is the same in every encoder mode.
    fn shared_test_cases(encoder: &UriEncoder) {
        assert_encodes(
            encoder,
            &[
                // Empty input and unreserved characters pass through untouched.
                ("", ""),
                ("abcABC123", "abcABC123"),
                ("0123456789", "0123456789"),
                ("-._~", "-._~"),
                // ASCII characters that are never allowed unescaped.
                ("\n", "%0A"),
                (" ", "%20"),
                ("\"", "%22"),
                ("%", "%25"),
                ("<", "%3C"),
                (">", "%3E"),
                ("\\", "%5C"),
                ("^", "%5E"),
                ("`", "%60"),
                ("{", "%7B"),
                ("|", "%7C"),
                ("}", "%7D"),
                ("\u{007f}", "%7F"),
                // Two-byte UTF-8 sequences, including the upper boundary.
                ("\u{00a0}", "%C2%A0"),
                ("\u{07ff}", "%DF%BF"),
                // Three-byte UTF-8 sequences, both boundaries.
                ("\u{0800}", "%E0%A0%80"),
                ("\u{ffff}", "%EF%BF%BF"),
                // Four-byte UTF-8 sequences, both boundaries and a typical emoji.
                ("\u{10000}", "%F0%90%80%80"),
                ("\u{1f600}", "%F0%9F%98%80"),
                ("\u{10ffff}", "%F4%8F%BF%BF"),
            ],
        );
    }

    #[test]
    fn test_component_encode() {
        let encoder = UriEncoder::new(UriEncoderMode::Component);
        // Every reserved delimiter must be escaped inside a component.
        assert_encodes(
            &encoder,
            &[
                (":", "%3A"),
                ("/", "%2F"),
                ("?", "%3F"),
                ("#", "%23"),
                ("[", "%5B"),
                ("]", "%5D"),
                ("@", "%40"),
                ("!", "%21"),
                ("$", "%24"),
                ("&", "%26"),
                ("'", "%27"),
                ("(", "%28"),
                (")", "%29"),
                ("*", "%2A"),
                ("+", "%2B"),
                (",", "%2C"),
                (";", "%3B"),
                ("=", "%3D"),
            ],
        );
        shared_test_cases(&encoder);
    }

    #[test]
    fn test_full_uri_encode() {
        let encoder = UriEncoder::new(UriEncoderMode::FullUri);
        assert_encodes(
            &encoder,
            &[
                (
                    "http://www.owasp.org/index.php?foo=bar&baz#fragment",
                    "http://www.owasp.org/index.php?foo=bar&baz#fragment",
                ),
                // Reserved delimiters stay as-is in a full URI.
                ("[]@", "[]@"),
                ("!$'()*+,;=", "!$'()*+,;="),
                // Characters outside the allowed set are still escaped.
                ("http://a/b c", "http://a/b%20c"),
            ],
        );
        shared_test_cases(&encoder);
    }
}