Skip to main content

wafrift_encoding/encoding/
unicode.rs

1//! Unicode and HTML entity encoding strategies.
2use std::fmt::Write as _;
3
/// Unicode encoding — each UTF-16 code unit becomes `\uXXXX`.
///
/// Characters up to U+FFFF produce a single escape. Characters above U+FFFF
/// are written as a UTF-16 surrogate pair (two escapes): a `\u` escape carries
/// exactly four hex digits, so formatting the raw code point (for example
/// `\u1F600`) would be read back as `\u1F60` followed by a literal `0`.
///
/// **Context**: ONLY safe when the target parser performs JSON/JavaScript decoding.
/// Using this on raw HTTP parameters will send a literal backslash-u sequence.
///
/// # Examples
///
/// * `"A"` becomes `\u0041`
/// * U+1F600 becomes `\uD83D\uDE00`
#[must_use]
pub fn unicode_encode(payload: &str) -> String {
    // Upper bound: a 1-3 byte character yields one 6-byte escape and a 4-byte
    // character yields two (12 bytes), so six bytes per input byte suffices.
    let mut out = String::with_capacity(payload.len() * 6);
    for unit in payload.encode_utf16() {
        // Formatting into a `String` cannot fail, so the result is discarded.
        let _ = write!(&mut out, "\\u{:04X}", unit);
    }
    out
}
16
/// IIS/ASP percent Unicode encoding — each UTF-16 code unit becomes `%uXXXX`.
///
/// Characters up to U+FFFF produce a single `%uXXXX` token. Characters above
/// U+FFFF are written as a UTF-16 surrogate pair (two tokens) so that every
/// token keeps the fixed four-hex-digit shape; formatting the raw code point
/// would yield a five- or six-digit token such as `%u1F600`.
///
/// **Context**: ONLY safe on IIS/ASP classic parsers.
///
/// # Examples
///
/// * `"A"` becomes `%u0041`
/// * U+1F600 becomes `%uD83D%uDE00`
#[must_use]
pub fn iis_unicode_encode(payload: &str) -> String {
    // Upper bound: a 1-3 byte character yields one 6-byte token and a 4-byte
    // character yields two (12 bytes), so six bytes per input byte suffices.
    let mut out = String::with_capacity(payload.len() * 6);
    for unit in payload.encode_utf16() {
        // Formatting into a `String` cannot fail, so the result is discarded.
        let _ = write!(&mut out, "%u{:04X}", unit);
    }
    out
}
28
/// JSON string encoding — produces a double-quoted JSON string literal.
///
/// Backslash and double quote are backslash-escaped; backspace, form feed,
/// line feed, carriage return and tab use their two-character short escapes;
/// any other character below U+0020 is written as `\u00XX`. Everything else
/// is copied through unchanged.
///
/// **Context**: ONLY safe when the target parser performs JSON decoding.
#[must_use]
pub fn json_string_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 2 + 2);
    encoded.push('"');
    for c in payload.chars() {
        // Characters that have a dedicated two-character escape.
        let short_escape = match c {
            '\\' => Some("\\\\"),
            '"' => Some("\\\""),
            '\u{0008}' => Some("\\b"),
            '\u{000C}' => Some("\\f"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };
        if let Some(escape) = short_escape {
            encoded.push_str(escape);
        } else if u32::from(c) < 0x20 {
            // Remaining control characters have no short form.
            let _ = write!(&mut encoded, "\\u{:04X}", u32::from(c));
        } else {
            encoded.push(c);
        }
    }
    encoded.push('"');
    encoded
}
54
/// HTML hexadecimal entity encoding — every character is rewritten as a
/// numeric character reference of the form `&#xHEX;`, where `HEX` is the
/// character's code point in uppercase hexadecimal without padding.
///
/// **Context**: ONLY safe in HTML contexts where the browser decodes entities.
#[must_use]
pub fn html_entity_encode(payload: &str) -> String {
    payload.chars().map(u32::from).fold(
        String::with_capacity(payload.len() * 6),
        |mut acc, code_point| {
            // Formatting into a `String` cannot fail, so the result is discarded.
            let _ = write!(&mut acc, "&#x{:X};", code_point);
            acc
        },
    )
}
66
/// HTML decimal entity encoding — every character is rewritten as a numeric
/// character reference of the form `&#DEC;`, where `DEC` is the character's
/// code point in decimal.
///
/// **Context**: ONLY safe in HTML contexts where the browser decodes entities.
#[must_use]
pub fn html_entity_decimal_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 6);
    for code_point in payload.chars().map(u32::from) {
        encoded.push_str("&#");
        // Formatting into a `String` cannot fail, so the result is discarded.
        let _ = write!(&mut encoded, "{}", code_point);
        encoded.push(';');
    }
    encoded
}
78
/// Fullwidth Unicode encoding — swaps printable ASCII for its fullwidth form.
///
/// Characters U+0021–U+007E (`!` through `~`) are shifted by 0xFEE0 into the
/// fullwidth block U+FF01–U+FF5E. An ASCII space becomes IDEOGRAPHIC SPACE
/// (U+3000). Every other character is passed through untouched.
///
/// **Bypass mechanism**: Many WAFs regex against ASCII keywords like `SELECT`,
/// `UNION`, `<script>`, etc. The fullwidth forms look alike but are different
/// code points, so those patterns do not match. A backend that applies Unicode
/// NFKC normalization folds them back to ASCII, so the payload is interpreted
/// even though the WAF never saw the ASCII form.
///
/// **Context**: Effective against WAFs in front of servers that normalize Unicode
/// (Java/Spring, .NET, Python 3, Go, PostgreSQL, etc.).
#[must_use]
pub fn fullwidth_encode(payload: &str) -> String {
    // Distance between an ASCII character and its fullwidth twin:
    // U+FF01 = U+0021 + 0xFEE0.
    const FULLWIDTH_OFFSET: u32 = 0xFEE0;
    const IDEOGRAPHIC_SPACE: char = '\u{3000}';

    let mut widened = String::with_capacity(payload.len() * 3);
    widened.extend(payload.chars().map(|c| {
        if c == ' ' {
            IDEOGRAPHIC_SPACE
        } else if matches!(c, '\x21'..='\x7e') {
            // The sum always lands inside U+FF01–U+FF5E; fall back to the
            // input character rather than panic if it ever did not.
            char::from_u32(u32::from(c) + FULLWIDTH_OFFSET).unwrap_or(c)
        } else {
            c
        }
    }));
    widened
}
108
/// Homoglyph substitution — swaps a fixed set of ASCII punctuation characters
/// for look-alike Unicode characters and leaves everything else (including
/// letters and digits) as it is.
///
/// **Bypass mechanism**: WAFs match `'`, `"`, `<`, `>`, `=`, etc. as literal
/// bytes. The look-alikes read the same in logs but are not matched by
/// byte-level regex. If the backend performs Unicode normalization (NFKC) or
/// accepts these code points in SQL/HTML contexts, the payload executes.
///
/// **Context**: Effective against byte-level WAFs. Requires backend Unicode
/// tolerance (common in modern frameworks).
#[must_use]
pub fn homoglyph_encode(payload: &str) -> String {
    // (ASCII character, look-alike replacement)
    const SUBSTITUTIONS: [(char, char); 10] = [
        // Quotes and delimiters
        ('\'', '\u{2019}'), // RIGHT SINGLE QUOTATION MARK
        ('"', '\u{201D}'),  // RIGHT DOUBLE QUOTATION MARK
        // Comparison operators
        ('<', '\u{FF1C}'), // FULLWIDTH LESS-THAN SIGN
        ('>', '\u{FF1E}'), // FULLWIDTH GREATER-THAN SIGN
        ('=', '\u{FF1D}'), // FULLWIDTH EQUALS SIGN
        // Punctuation
        ('(', '\u{FF08}'), // FULLWIDTH LEFT PARENTHESIS
        (')', '\u{FF09}'), // FULLWIDTH RIGHT PARENTHESIS
        (';', '\u{FF1B}'), // FULLWIDTH SEMICOLON
        ('-', '\u{2010}'), // HYPHEN
        ('/', '\u{2215}'), // DIVISION SLASH
    ];

    let mut disguised = String::with_capacity(payload.len() * 4);
    for original in payload.chars() {
        // Characters without a table entry are kept unchanged.
        let replacement = SUBSTITUTIONS
            .iter()
            .find(|&&(ascii, _)| ascii == original)
            .map_or(original, |&(_, lookalike)| lookalike);
        disguised.push(replacement);
    }
    disguised
}
144
/// Unit tests for the encoders in this module.
///
/// Expected fullwidth output is spelled with `\u{...}` escapes rather than
/// literal glyphs, so the expectations cannot be silently folded back to ASCII
/// by tooling that applies Unicode compatibility normalization to source text.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn unicode_encode_basic() {
        assert_eq!(unicode_encode("A"), "\\u0041");
        assert_eq!(unicode_encode("AB"), "\\u0041\\u0042");
    }

    #[test]
    fn unicode_encode_special_chars() {
        let encoded = unicode_encode("' OR 1=1--");
        assert!(encoded.contains("\\u0027")); // '
        assert!(encoded.contains("\\u003D")); // =
    }

    #[test]
    fn unicode_encode_unicode() {
        let encoded = unicode_encode("日本語");
        assert_eq!(encoded, "\\u65E5\\u672C\\u8A9E");
    }

    #[test]
    fn iis_unicode_encode_basic() {
        assert_eq!(iis_unicode_encode("A"), "%u0041");
        assert_eq!(iis_unicode_encode("AB"), "%u0041%u0042");
    }

    #[test]
    fn json_encode_basic() {
        assert_eq!(json_string_encode("A"), "\"A\"");
        assert_eq!(json_string_encode("A\\B"), "\"A\\\\B\"");
        assert_eq!(json_string_encode("A\"B"), "\"A\\\"B\"");
        assert_eq!(json_string_encode("A\nB"), "\"A\\nB\"");
    }

    #[test]
    fn json_encode_control_chars() {
        assert_eq!(json_string_encode("\x01"), "\"\\u0001\"");
    }

    #[test]
    fn html_entity_encode_basic() {
        assert_eq!(html_entity_encode("A"), "&#x41;");
        assert_eq!(html_entity_encode("AB"), "&#x41;&#x42;");
    }

    #[test]
    fn html_entity_encode_special_chars() {
        let encoded = html_entity_encode("<script>");
        assert_eq!(encoded, "&#x3C;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3E;");
    }

    #[test]
    fn html_entity_decimal_encode_basic() {
        assert_eq!(html_entity_decimal_encode("A"), "&#65;");
        assert_eq!(html_entity_decimal_encode("<"), "&#60;");
    }

    #[test]
    fn html_entity_encode_empty() {
        assert_eq!(html_entity_encode(""), "");
    }

    #[test]
    fn unicode_encode_empty() {
        assert_eq!(unicode_encode(""), "");
    }

    // ── Fullwidth encoding tests ───────────────────────────────────────

    #[test]
    fn fullwidth_encode_sql_keywords() {
        let encoded = fullwidth_encode("SELECT");
        // Fullwidth S, E, L, E, C, T.
        assert_eq!(encoded, "\u{FF33}\u{FF25}\u{FF2C}\u{FF25}\u{FF23}\u{FF34}");
        // Every ASCII letter should be in fullwidth range
        for ch in encoded.chars() {
            assert!(
                ch as u32 >= 0xFF01,
                "expected fullwidth char, got {ch} (U+{:04X})",
                ch as u32
            );
        }
    }

    #[test]
    fn fullwidth_encode_spaces() {
        let encoded = fullwidth_encode("A B");
        assert!(
            encoded.contains('\u{3000}'),
            "space should become ideographic space"
        );
    }

    #[test]
    fn fullwidth_encode_preserves_non_ascii() {
        let encoded = fullwidth_encode("日本語");
        assert_eq!(encoded, "日本語", "non-ASCII should pass through unchanged");
    }

    #[test]
    fn fullwidth_encode_operators() {
        let encoded = fullwidth_encode("1=1");
        // Fullwidth digit one, equals sign, digit one.
        assert_eq!(encoded, "\u{FF11}\u{FF1D}\u{FF11}");
    }

    #[test]
    fn fullwidth_encode_sqli_payload() {
        let encoded = fullwidth_encode("' OR 1=1--");
        // Should contain fullwidth equivalents, not ASCII
        assert!(!encoded.contains("OR"), "should not contain ASCII 'OR'");
        assert!(
            encoded.contains("\u{FF2F}\u{FF32}"),
            "should contain fullwidth 'OR'"
        );
    }

    #[test]
    fn fullwidth_encode_empty() {
        assert_eq!(fullwidth_encode(""), "");
    }

    // ── Homoglyph encoding tests ───────────────────────────────────────

    #[test]
    fn homoglyph_replaces_quotes() {
        let encoded = homoglyph_encode("' OR '1'='1");
        assert!(
            !encoded.contains('\''),
            "ASCII single quote should be replaced"
        );
        assert!(
            encoded.contains('\u{2019}'),
            "should contain RIGHT SINGLE QUOTATION MARK"
        );
    }

    #[test]
    fn homoglyph_replaces_angle_brackets() {
        let encoded = homoglyph_encode("<script>");
        assert!(!encoded.contains('<'), "ASCII < should be replaced");
        assert!(!encoded.contains('>'), "ASCII > should be replaced");
        assert!(encoded.contains('\u{FF1C}'), "should contain fullwidth <");
        assert!(encoded.contains('\u{FF1E}'), "should contain fullwidth >");
    }

    #[test]
    fn homoglyph_replaces_equals() {
        let encoded = homoglyph_encode("1=1");
        assert!(!encoded.contains('='), "ASCII = should be replaced");
        assert!(encoded.contains('\u{FF1D}'), "should contain fullwidth =");
    }

    #[test]
    fn homoglyph_preserves_letters() {
        let encoded = homoglyph_encode("SELECT");
        assert_eq!(encoded, "SELECT", "letters should be preserved");
    }

    #[test]
    fn homoglyph_encode_empty() {
        assert_eq!(homoglyph_encode(""), "");
    }

    #[test]
    fn homoglyph_replaces_parens() {
        let encoded = homoglyph_encode("fn()");
        assert!(encoded.contains('\u{FF08}'), "should contain fullwidth (");
        assert!(encoded.contains('\u{FF09}'), "should contain fullwidth )");
    }
}