Skip to main content

wafrift_encoding/encoding/
unicode.rs

1//! Unicode and HTML entity encoding strategies.
2use std::fmt::Write as _;
3
/// Escapes every character of `payload` as a `\uXXXX` sequence.
///
/// Characters in the Basic Multilingual Plane produce a single escape with
/// four uppercase hex digits. Characters above U+FFFF are split into a
/// UTF-16 surrogate pair and produce two consecutive escapes.
///
/// **Context**: ONLY safe when the target parser performs JSON/JavaScript
/// decoding. On raw HTTP parameters the literal backslash-u text is sent as-is.
#[must_use]
pub fn unicode_encode(payload: &str) -> String {
    let mut encoded = String::with_capacity(payload.len() * 6);
    for scalar in payload.chars().map(u32::from) {
        if scalar <= 0xFFFF {
            let _ = write!(encoded, "\\u{:04X}", scalar);
            continue;
        }
        // Supplementary plane: split the 20-bit offset into lead/trail surrogates.
        let offset = scalar - 0x1_0000;
        let lead = 0xD800 + ((offset >> 10) & 0x3FF);
        let trail = 0xDC00 + (offset & 0x3FF);
        let _ = write!(encoded, "\\u{:04X}\\u{:04X}", lead, trail);
    }
    encoded
}
25
/// IIS/ASP percent Unicode encoding — each UTF-16 code unit becomes `%uXXXX`.
///
/// Every escape carries exactly four uppercase hex digits. Characters above
/// U+FFFF do not fit in four digits, so they are emitted as a surrogate pair
/// (two `%uXXXX` escapes) instead of a five- or six-digit escape that a
/// four-digit decoder would misread.
///
/// **Context**: ONLY safe on IIS/ASP classic parsers.
#[must_use]
pub fn iis_unicode_encode(payload: &str) -> String {
    let mut out = String::with_capacity(payload.len() * 6);
    // `encode_utf16` yields one unit per BMP character and two per non-BMP character.
    for unit in payload.encode_utf16() {
        let _ = write!(&mut out, "%u{:04X}", unit);
    }
    out
}
37
/// Renders `payload` as a double-quoted JSON string literal.
///
/// Backslash and double quote are backslash-escaped, the five control
/// characters with short forms use `\b`, `\f`, `\n`, `\r`, `\t`, and any other
/// character below U+0020 is written as `\u00XX`. Everything else is copied
/// through unchanged.
///
/// **Context**: ONLY safe when the target parser performs JSON decoding.
#[must_use]
pub fn json_string_encode(payload: &str) -> String {
    let mut quoted = String::with_capacity(payload.len() * 2 + 2);
    quoted.push('"');
    for ch in payload.chars() {
        // Two-character escapes defined by the JSON grammar.
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\u{0008}' => Some("\\b"),
            '\t' => Some("\\t"),
            '\n' => Some("\\n"),
            '\u{000C}' => Some("\\f"),
            '\r' => Some("\\r"),
            _ => None,
        };
        if let Some(escape) = short_escape {
            quoted.push_str(escape);
        } else if u32::from(ch) < 0x20 {
            // Remaining C0 controls have no short form.
            let _ = write!(quoted, "\\u{:04X}", u32::from(ch));
        } else {
            quoted.push(ch);
        }
    }
    quoted.push('"');
    quoted
}
63
/// Rewrites every character of `payload` as a hexadecimal numeric character
/// reference of the form `&#xXX;`.
///
/// The hex digits are uppercase and are not zero-padded.
///
/// **Context**: ONLY safe in HTML contexts where the browser decodes entities.
#[must_use]
pub fn html_entity_encode(payload: &str) -> String {
    payload
        .chars()
        .fold(String::with_capacity(payload.len() * 6), |mut acc, ch| {
            let _ = write!(acc, "&#x{:X};", u32::from(ch));
            acc
        })
}
75
/// Rewrites every character of `payload` as a decimal numeric character
/// reference of the form `&#DD;`, where `DD` is the character's code point.
///
/// **Context**: ONLY safe in HTML contexts where the browser decodes entities.
#[must_use]
pub fn html_entity_decimal_encode(payload: &str) -> String {
    let mut entities = String::with_capacity(payload.len() * 6);
    for code in payload.chars().map(u32::from) {
        let _ = write!(entities, "&#{code};");
    }
    entities
}
87
/// Fullwidth Unicode encoding — swaps printable ASCII for its fullwidth form.
///
/// Each character in U+0021–U+007E (`!` through `~`) is shifted by `0xFEE0`
/// into the fullwidth range U+FF01–U+FF5E. An ASCII space becomes the
/// ideographic space U+3000. All other characters are copied through
/// unchanged.
///
/// **Bypass mechanism**: Many WAFs regex against ASCII keywords like `SELECT`,
/// `UNION`, `<script>`, etc. Fullwidth characters render like their ASCII
/// counterparts but have different code points, so the regex does not match.
/// A backend that applies Unicode NFKC normalization folds them back to ASCII,
/// so the payload executes even though the WAF never saw the ASCII form.
///
/// **Context**: Effective against WAFs in front of servers that normalize
/// Unicode (Java/Spring, .NET, Python 3, Go, PostgreSQL, etc.).
#[must_use]
pub fn fullwidth_encode(payload: &str) -> String {
    // U+FF01 (fullwidth '!') minus U+0021 (ASCII '!').
    const FULLWIDTH_OFFSET: u32 = 0xFEE0;

    let mut widened = String::with_capacity(payload.len() * 3);
    widened.extend(payload.chars().map(|ch| match ch {
        ' ' => '\u{3000}',
        '!'..='~' => char::from_u32(u32::from(ch) + FULLWIDTH_OFFSET).unwrap_or(ch),
        other => other,
    }));
    widened
}
117
/// Homoglyph substitution — swaps selected ASCII punctuation for look-alike
/// Unicode characters and leaves everything else (letters, digits, other
/// symbols) untouched.
///
/// **Bypass mechanism**: WAFs match `'`, `"`, `<`, `>`, `=`, etc. as literal
/// bytes. The substitutes look similar in logs but are different code points,
/// so byte-level regex does not match them. If the backend performs Unicode
/// normalization (NFKC) or accepts these code points in SQL/HTML contexts,
/// the payload executes.
///
/// **Context**: Effective against byte-level WAFs. Requires backend Unicode
/// tolerance (common in modern frameworks).
#[must_use]
pub fn homoglyph_encode(payload: &str) -> String {
    /// Returns the look-alike for `ch`, or `ch` itself when none is defined.
    fn lookalike(ch: char) -> char {
        match ch {
            // Quotes
            '\'' => '\u{2019}', // U+2019 RIGHT SINGLE QUOTATION MARK
            '"' => '\u{201D}',  // U+201D RIGHT DOUBLE QUOTATION MARK
            // Comparison operators
            '<' => '\u{FF1C}', // U+FF1C FULLWIDTH LESS-THAN SIGN
            '=' => '\u{FF1D}', // U+FF1D FULLWIDTH EQUALS SIGN
            '>' => '\u{FF1E}', // U+FF1E FULLWIDTH GREATER-THAN SIGN
            // Grouping and separators
            '(' => '\u{FF08}', // U+FF08 FULLWIDTH LEFT PARENTHESIS
            ')' => '\u{FF09}', // U+FF09 FULLWIDTH RIGHT PARENTHESIS
            ';' => '\u{FF1B}', // U+FF1B FULLWIDTH SEMICOLON
            '-' => '\u{2010}', // U+2010 HYPHEN
            '/' => '\u{2215}', // U+2215 DIVISION SLASH
            // Letters, digits and everything else stay readable.
            unchanged => unchanged,
        }
    }

    let mut disguised = String::with_capacity(payload.len() * 4);
    disguised.extend(payload.chars().map(lookalike));
    disguised
}
153
/// Unit tests for the encoders in this module.
///
/// Expected fullwidth output is spelled with `\u{...}` escapes rather than
/// literal fullwidth glyphs, so the expectations survive any tool that
/// NFKC-normalizes the source text back to ASCII.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn unicode_encode_basic() {
        assert_eq!(unicode_encode("A"), "\\u0041");
        assert_eq!(unicode_encode("AB"), "\\u0041\\u0042");
    }

    #[test]
    fn unicode_encode_special_chars() {
        let encoded = unicode_encode("' OR 1=1--");
        assert!(encoded.contains("\\u0027")); // '
        assert!(encoded.contains("\\u003D")); // =
    }

    #[test]
    fn unicode_encode_unicode() {
        // U+65E5 U+672C U+8A9E
        let encoded = unicode_encode("\u{65E5}\u{672C}\u{8A9E}");
        assert_eq!(encoded, "\\u65E5\\u672C\\u8A9E");
    }

    #[test]
    fn iis_unicode_encode_basic() {
        assert_eq!(iis_unicode_encode("A"), "%u0041");
        assert_eq!(iis_unicode_encode("AB"), "%u0041%u0042");
    }

    #[test]
    fn json_encode_basic() {
        assert_eq!(json_string_encode("A"), "\"A\"");
        assert_eq!(json_string_encode("A\\B"), "\"A\\\\B\"");
        assert_eq!(json_string_encode("A\"B"), "\"A\\\"B\"");
        assert_eq!(json_string_encode("A\nB"), "\"A\\nB\"");
    }

    #[test]
    fn json_encode_control_chars() {
        assert_eq!(json_string_encode("\x01"), "\"\\u0001\"");
    }

    #[test]
    fn html_entity_encode_basic() {
        assert_eq!(html_entity_encode("A"), "&#x41;");
        assert_eq!(html_entity_encode("AB"), "&#x41;&#x42;");
    }

    #[test]
    fn html_entity_encode_special_chars() {
        let encoded = html_entity_encode("<script>");
        assert_eq!(encoded, "&#x3C;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3E;");
    }

    #[test]
    fn html_entity_decimal_encode_basic() {
        assert_eq!(html_entity_decimal_encode("A"), "&#65;");
        assert_eq!(html_entity_decimal_encode("<"), "&#60;");
    }

    #[test]
    fn html_entity_encode_empty() {
        assert_eq!(html_entity_encode(""), "");
    }

    #[test]
    fn unicode_encode_empty() {
        assert_eq!(unicode_encode(""), "");
    }

    // ── Fullwidth encoding tests ───────────────────────────────────────

    #[test]
    fn fullwidth_encode_sql_keywords() {
        let encoded = fullwidth_encode("SELECT");
        // Fullwidth S, E, L, E, C, T.
        assert_eq!(encoded, "\u{FF33}\u{FF25}\u{FF2C}\u{FF25}\u{FF23}\u{FF34}");
        // Every ASCII letter should land inside the fullwidth range.
        for ch in encoded.chars() {
            assert!(
                (0xFF01..=0xFF5E).contains(&(ch as u32)),
                "expected fullwidth char, got {ch} (U+{:04X})",
                ch as u32
            );
        }
    }

    #[test]
    fn fullwidth_encode_spaces() {
        let encoded = fullwidth_encode("A B");
        assert!(
            encoded.contains('\u{3000}'),
            "space should become ideographic space"
        );
    }

    #[test]
    fn fullwidth_encode_preserves_non_ascii() {
        let input = "\u{65E5}\u{672C}\u{8A9E}";
        let encoded = fullwidth_encode(input);
        assert_eq!(encoded, input, "non-ASCII should pass through unchanged");
    }

    #[test]
    fn fullwidth_encode_operators() {
        let encoded = fullwidth_encode("1=1");
        // Fullwidth digit one, equals sign, digit one.
        assert_eq!(encoded, "\u{FF11}\u{FF1D}\u{FF11}");
    }

    #[test]
    fn fullwidth_encode_sqli_payload() {
        let encoded = fullwidth_encode("' OR 1=1--");
        // Should contain fullwidth equivalents, not ASCII.
        assert!(!encoded.contains("OR"), "should not contain ASCII 'OR'");
        assert!(
            encoded.contains("\u{FF2F}\u{FF32}"),
            "should contain fullwidth 'OR'"
        );
    }

    #[test]
    fn fullwidth_encode_empty() {
        assert_eq!(fullwidth_encode(""), "");
    }

    // ── Homoglyph encoding tests ───────────────────────────────────────

    #[test]
    fn homoglyph_replaces_quotes() {
        let encoded = homoglyph_encode("' OR '1'='1");
        assert!(
            !encoded.contains('\''),
            "ASCII single quote should be replaced"
        );
        assert!(
            encoded.contains('\u{2019}'),
            "should contain RIGHT SINGLE QUOTATION MARK"
        );
    }

    #[test]
    fn homoglyph_replaces_angle_brackets() {
        let encoded = homoglyph_encode("<script>");
        assert!(!encoded.contains('<'), "ASCII < should be replaced");
        assert!(!encoded.contains('>'), "ASCII > should be replaced");
        assert!(encoded.contains('\u{FF1C}'), "should contain fullwidth <");
        assert!(encoded.contains('\u{FF1E}'), "should contain fullwidth >");
    }

    #[test]
    fn homoglyph_replaces_equals() {
        let encoded = homoglyph_encode("1=1");
        assert!(!encoded.contains('='), "ASCII = should be replaced");
        assert!(encoded.contains('\u{FF1D}'), "should contain fullwidth =");
    }

    #[test]
    fn homoglyph_preserves_letters() {
        let encoded = homoglyph_encode("SELECT");
        assert_eq!(encoded, "SELECT", "letters should be preserved");
    }

    #[test]
    fn homoglyph_encode_empty() {
        assert_eq!(homoglyph_encode(""), "");
    }

    #[test]
    fn homoglyph_replaces_parens() {
        let encoded = homoglyph_encode("fn()");
        assert!(encoded.contains('\u{FF08}'), "should contain fullwidth (");
        assert!(encoded.contains('\u{FF09}'), "should contain fullwidth )");
    }
}