Skip to main content

wafrift_encoding/encoding/
structural.rs

1//! Structural encoding strategies — byte-level and framing manipulations.
2
3use base64::{Engine as _, engine::general_purpose};
4use std::io::Write as _;
5
6use crate::error::EncodeError;
7use wafrift_types::hash::{FNV_OFFSET_64, FNV_PRIME_64};
8
/// Output of [`chunked_split`]: a chunk-framed body plus the headers it needs.
///
/// The bytes in `body` only carry their intended meaning when they are sent
/// as the body of an HTTP request that uses `Transfer-Encoding: chunked`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkedBody {
    /// Raw bytes of the chunk-encoded body.
    pub body: Vec<u8>,
    /// `(name, value)` header pairs that have to be sent together with `body`.
    pub required_headers: Vec<(String, String)>,
}
20
21/// Null byte injection — append `%00` to truncate strings in C-style parsers.
22///
23/// **Context**: `php`, `cgi` — only semantically correct for backends using
24/// C-style null-terminated string handling.
25pub fn null_byte_inject(payload: impl AsRef<[u8]>) -> Result<String, EncodeError> {
26    let payload = payload.as_ref();
27    let payload_str = std::str::from_utf8(payload).map_err(|_| EncodeError::InvalidUtf8)?;
28    if payload.contains(&b'.') {
29        Ok(format!("{payload_str}%00.jpg"))
30    } else {
31        Ok(format!("{payload_str}%00"))
32    }
33}
34
35/// Overlong UTF-8 encoding (2-byte) — represent ASCII non-alphanumeric as 2-byte sequences.
36///
37/// **Context**: `iis-6` — only works against specific legacy WAFs/frontends that
38/// normalize overlong sequences rather than rejecting them.
39pub fn overlong_utf8(payload: impl AsRef<[u8]>) -> Result<String, EncodeError> {
40    let text = std::str::from_utf8(payload.as_ref()).map_err(|_| EncodeError::InvalidUtf8)?;
41    Ok(text
42        .chars()
43        .map(|ch| {
44            if ch.is_ascii_alphanumeric() {
45                ch.to_string()
46            } else if ch.is_ascii() {
47                let byte = ch as u8;
48                format!("%{:02X}%{:02X}", 0xC0 | (byte >> 6), 0x80 | (byte & 0x3F))
49            } else {
50                ch.to_string()
51            }
52        })
53        .collect())
54}
55
56/// Extended overlong UTF-8 encoding (3-byte) — broader coverage with 3-byte sequences.
57///
58/// **Context**: `iis-6` — some WAFs reject 2-byte overlongs but accept 3-byte overlongs.
59///
60/// RFC 3629 3-byte form: `1110xxxx 10xxxxxx 10xxxxxx` encoding a
61/// 16-bit codepoint as `(x[0]<<12) | (x[1]<<6) | x[2]`. For an
62/// ASCII byte (codepoint ≤ 0x7F) the high nibble is zero so the
63/// lead byte is `0xE0`; the continuation bytes carry the codepoint
64/// split into two 6-bit halves: `(byte >> 6)` and `(byte & 0x3F)`.
65///
66/// Pre-fix this used `0x80 | byte` for the third byte, which
67/// silently produced INVALID continuation bytes for any input
68/// `byte >= 0x40` (since `0x80 | 0x40 = 0xC0`, above the valid
69/// continuation range 0x80–0xBF). That includes `@`, `[`, `\`,
70/// `]`, `^`, `_`, `` ` ``, `{`, `|`, `}`, `~` — all of which
71/// appear in real-world payloads (SQL backticks, path escapes,
72/// template-injection braces). Any conforming UTF-8 decoder
73/// rejected those sequences outright, so the encoder produced
74/// garbage rather than the intended bypass for ~10 punctuation
75/// characters.
76pub fn overlong_utf8_more(payload: impl AsRef<[u8]>) -> Result<String, EncodeError> {
77    let text = std::str::from_utf8(payload.as_ref()).map_err(|_| EncodeError::InvalidUtf8)?;
78    Ok(text
79        .chars()
80        .map(|ch| {
81            if ch.is_ascii_alphanumeric() {
82                ch.to_string()
83            } else if ch.is_ascii() {
84                let byte = ch as u8;
85                let cont1 = 0x80 | (byte >> 6);
86                let cont2 = 0x80 | (byte & 0x3F);
87                format!("%E0%{cont1:02X}%{cont2:02X}")
88            } else {
89                ch.to_string()
90            }
91        })
92        .collect())
93}
94
95/// Chunked transfer-encoding split — break payload across HTTP chunks.
96///
97/// **Context**: `http-request-body` — ONLY valid when sent with
98/// `Transfer-Encoding: chunked`.
99pub fn chunked_split(
100    payload: impl AsRef<[u8]>,
101    chunk_size: usize,
102) -> Result<ChunkedBody, EncodeError> {
103    let payload = payload.as_ref();
104    if payload.is_empty() {
105        return Ok(ChunkedBody {
106            body: Vec::new(),
107            required_headers: vec![("Transfer-Encoding".to_string(), "chunked".to_string())],
108        });
109    }
110    let chunk_size = chunk_size.max(1);
111    let mut result: Vec<u8> = Vec::with_capacity(payload.len() + 64);
112
113    for chunk in payload.chunks(chunk_size) {
114        let _ = write!(&mut result, "{:x}\r\n", chunk.len());
115        result.extend_from_slice(chunk);
116        result.extend_from_slice(b"\r\n");
117    }
118    result.extend_from_slice(b"0\r\n\r\n");
119
120    Ok(ChunkedBody {
121        body: result,
122        required_headers: vec![("Transfer-Encoding".to_string(), "chunked".to_string())],
123    })
124}
125
126/// HTTP parameter pollution — duplicate parameter with a benign first value.
127///
128/// Depending on the server framework, the last value wins (PHP, ASP.NET)
129/// while many WAFs only inspect the first parameter occurrence.
130pub fn parameter_pollute(payload: impl AsRef<[u8]>) -> Result<String, EncodeError> {
131    let payload = payload.as_ref();
132    let payload_str = std::str::from_utf8(payload).map_err(|_| EncodeError::InvalidUtf8)?;
133    if let Some(eq_pos) = payload.iter().position(|byte| *byte == b'=') {
134        let key = std::str::from_utf8(&payload[..eq_pos]).map_err(|_| EncodeError::InvalidUtf8)?;
135        Ok(format!("{key}=safe&{payload_str}"))
136    } else {
137        // Deterministic decoy: a plausible 8-letter junk parameter name
138        // derived from the payload via FNV-1a. Identical input ⇒
139        // identical output — a non-deterministic encoder cannot be
140        // regression-pinned and makes a successful bypass impossible to
141        // reproduce (the rest of the evasion pipeline, e.g. the equiv
142        // generator, is deterministic-seeded for exactly this reason).
143        let mut h: u64 = FNV_OFFSET_64;
144        for &b in payload {
145            h ^= u64::from(b);
146            h = h.wrapping_mul(FNV_PRIME_64);
147        }
148        let decoy: String = (0..8)
149            .map(|i| (b'a' + (((h >> (i * 8)) as u8) % 26)) as char)
150            .collect();
151        Ok(format!("{decoy}=1&{payload_str}"))
152    }
153}
154
155/// Base64 encoding — standard alphabet.
156pub fn base64_encode(payload: impl AsRef<[u8]>) -> String {
157    general_purpose::STANDARD.encode(payload)
158}
159
160/// Base64 URL-safe encoding — `-_` alphabet, no padding.
161pub fn base64_url_encode(payload: impl AsRef<[u8]>) -> String {
162    general_purpose::URL_SAFE_NO_PAD.encode(payload)
163}
164
165/// Hex encoding.
166pub fn hex_encode(payload: impl AsRef<[u8]>) -> String {
167    hex::encode(payload)
168}
169
170// UTF-7 (RFC 2152) codec moved to `wafrift_types::utf7` so `wafrift-grammar`
171// can reuse it for the `charset=utf-7` delivery shape WITHOUT depending on
172// this crate's heavy native deps (brotli/flate2). Re-exported here so every
173// existing `structural::utf7_encode` / `Strategy::Utf7Encode` caller and the
174// crate's public API are unchanged.
175pub use wafrift_types::utf7::{utf7_decode, utf7_encode};
176
177/// Gzip compression.
178///
179/// **Context**: `http-request-body` — ONLY valid with `Content-Encoding: gzip`.
180pub fn gzip_encode(payload: impl AsRef<[u8]>) -> Result<String, EncodeError> {
181    let payload = payload.as_ref();
182    let mut encoder = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
183    encoder
184        .write_all(payload)
185        .map_err(|e| EncodeError::InvalidConfig(format!("gzip failed: {e}")))?;
186    let bytes = encoder
187        .finish()
188        .map_err(|e| EncodeError::InvalidConfig(format!("gzip failed: {e}")))?;
189    Ok(general_purpose::STANDARD.encode(bytes))
190}
191
192/// Deflate compression.
193///
194/// **Context**: `http-request-body` — ONLY valid with `Content-Encoding: deflate`.
195pub fn deflate_encode(payload: impl AsRef<[u8]>) -> Result<String, EncodeError> {
196    let payload = payload.as_ref();
197    let mut encoder =
198        flate2::write::DeflateEncoder::new(Vec::new(), flate2::Compression::default());
199    encoder
200        .write_all(payload)
201        .map_err(|e| EncodeError::InvalidConfig(format!("deflate failed: {e}")))?;
202    let bytes = encoder
203        .finish()
204        .map_err(|e| EncodeError::InvalidConfig(format!("deflate failed: {e}")))?;
205    Ok(general_purpose::STANDARD.encode(bytes))
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn null_byte_with_extension() {
        assert_eq!(null_byte_inject("file.php").unwrap(), "file.php%00.jpg");
    }

    #[test]
    fn null_byte_without_extension() {
        assert_eq!(null_byte_inject("payload").unwrap(), "payload%00");
    }

    #[test]
    fn overlong_utf8_slash() {
        let result = overlong_utf8("/").unwrap();
        assert_eq!(result, "%C0%AF");
    }

    #[test]
    fn overlong_utf8_more_slash() {
        let result = overlong_utf8_more("/").unwrap();
        assert_eq!(result, "%E0%80%AF");
    }

    #[test]
    fn overlong_utf8_more_punctuation_above_0x40_uses_valid_continuation_bytes() {
        // Regression for the silent garbage bug: pre-fix, every input
        // byte >= 0x40 produced a 3rd byte above 0xBF (e.g.
        // `0x80 | 0x7B` = 0xFB), which is not a continuation byte, so
        // the percent-decoded sequence could not be decoded back to the
        // original character and the bypass produced nothing.
        //
        // The output IS still overlong by RFC 3629 (`std::str::from_utf8`
        // refuses it — that's the whole point of an overlong-encoding
        // bypass), but the continuation bytes must be in the valid
        // 0x80..=0xBF window so a lenient parser (the WAF / origin
        // we're trying to confuse) actually decodes the bytes back to
        // the original ASCII codepoint.
        for ch in ['@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'] {
            let input = ch.to_string();
            let encoded = overlong_utf8_more(&input).unwrap();
            assert!(
                encoded.starts_with("%E0%"),
                "{ch:?} should use 3-byte form, got: {encoded}"
            );
            // Percent-decode by hand: "%E0%81%80" -> [0xE0, 0x81, 0x80].
            let bytes: Vec<u8> = encoded
                .split('%')
                .filter(|part| !part.is_empty())
                .map(|part| u8::from_str_radix(part, 16).unwrap())
                .collect();
            assert_eq!(bytes.len(), 3, "expected 3 bytes for {ch:?}");
            assert_eq!(bytes[0], 0xE0, "lead byte wrong for {ch:?}");
            assert!(
                (0x80..=0xBF).contains(&bytes[1]),
                "{ch:?} 2nd byte 0x{:02X} outside valid continuation range",
                bytes[1]
            );
            assert!(
                (0x80..=0xBF).contains(&bytes[2]),
                "{ch:?} 3rd byte 0x{:02X} outside valid continuation range",
                bytes[2]
            );
            // Verify the bit-shifted codepoint matches the original.
            // RFC 3629 decode: ((b1 & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
            // Lead nibble is 0 (we encoded ASCII), so the upper 12
            // bits drop out and the codepoint equals
            // ((b2 & 0x3F) << 6) | (b3 & 0x3F).
            let codepoint = (u32::from(bytes[1] & 0x3F) << 6) | u32::from(bytes[2] & 0x3F);
            assert_eq!(
                codepoint, ch as u32,
                "decoded codepoint 0x{codepoint:X} != original 0x{:X}",
                ch as u32
            );
        }
    }

    #[test]
    fn overlong_utf8_more_preserves_alphanumerics_verbatim() {
        // Alphanumeric chars pass through unchanged — this is the
        // existing fast-path: only the punctuation surrounding the
        // alphanumeric payload markers gets obfuscated.
        assert_eq!(overlong_utf8_more("abc123").unwrap(), "abc123");
    }

    #[test]
    fn chunked_split_produces_valid_chunks() {
        let result = chunked_split("SELECT * FROM users", 3).unwrap();
        // The payload is ASCII, so the framed body is valid UTF-8; borrow it
        // instead of cloning the whole buffer.
        let body = std::str::from_utf8(&result.body).unwrap();
        assert!(body.contains("\r\n"));
        assert!(body.ends_with("0\r\n\r\n"));
        assert_eq!(
            result.required_headers,
            vec![("Transfer-Encoding".to_string(), "chunked".to_string())]
        );
    }

    #[test]
    fn chunked_split_byte_lengths_correct() {
        // Includes non-UTF-8 bytes to prove chunk sizes count bytes, not chars.
        let payload = b"abc\x80\x81defgh";
        let result = chunked_split(payload, 3).unwrap();
        // Parse the raw bytes: each chunk is "size\r\ndata\r\n"
        let mut i = 0;
        let mut chunk_count = 0;
        let expected_chunk_sizes = [3_usize, 3, 3, 1];
        while i < result.body.len() {
            // Find the \r\n after the size
            let size_end = result.body[i..]
                .windows(2)
                .position(|w| w == b"\r\n")
                .unwrap_or(result.body.len() - i)
                + i;
            let size_str = std::str::from_utf8(&result.body[i..size_end]).unwrap();
            if size_str == "0" {
                // Terminating chunk
                break;
            }
            let size = usize::from_str_radix(size_str, 16).unwrap();
            assert_eq!(size, expected_chunk_sizes[chunk_count]);
            // Data starts after \r\n and ends after size bytes
            let data_start = size_end + 2;
            let data_end = data_start + size;
            assert_eq!(
                &result.body[data_start..data_end],
                &payload[chunk_count * 3..chunk_count * 3 + size]
            );
            // Skip the trailing \r\n
            i = data_end + 2;
            chunk_count += 1;
        }
        assert_eq!(chunk_count, 4);
    }

    #[test]
    fn chunked_split_empty() {
        let result = chunked_split("", 3).unwrap();
        assert!(result.body.is_empty());
        // The header requirement is reported even for an empty payload.
        assert_eq!(
            result.required_headers,
            vec![("Transfer-Encoding".to_string(), "chunked".to_string())]
        );
    }

    #[test]
    fn parameter_pollution_with_key_value() {
        let result = parameter_pollute("user=' OR 1=1--").unwrap();
        assert!(result.starts_with("user=safe&"));
        assert!(result.contains("user=' OR 1=1--"));
    }

    #[test]
    fn parameter_pollution_without_equals() {
        let result = parameter_pollute("payload").unwrap();
        assert!(result.ends_with("&payload"));
        assert!(!result.contains("_wafrift_decoy"));
        // The decoy is a deterministic 8-letter lowercase junk param.
        let decoy = result
            .strip_suffix("=1&payload")
            .expect("decoy=1&payload shape");
        assert_eq!(decoy.len(), 8, "decoy must be 8 chars: {result}");
        assert!(
            decoy.bytes().all(|b| b.is_ascii_lowercase()),
            "decoy must be [a-z]{{8}}: {result}"
        );
        // Deterministic: identical payload ⇒ byte-identical output.
        assert_eq!(result, parameter_pollute("payload").unwrap());
        // A different payload yields a different decoy. Compare the decoys
        // themselves: comparing whole outputs would pass trivially because
        // the payload suffix already differs. (With the standard FNV-1a
        // prime, payloads differing only in the lowest bit of the last byte
        // always differ in the first decoy letter.)
        let other = parameter_pollute("payloae").unwrap();
        let other_decoy = other
            .strip_suffix("=1&payloae")
            .expect("decoy=1&payloae shape");
        assert_ne!(decoy, other_decoy, "decoy must depend on the payload");
    }

    #[test]
    fn base64_standard() {
        assert_eq!(base64_encode("hello"), "aGVsbG8=");
    }

    #[test]
    fn base64_url_safe() {
        assert_eq!(base64_url_encode("hello+++"), "aGVsbG8rKys");
    }

    #[test]
    fn hex_encode_basic() {
        assert_eq!(hex_encode("ABC"), "414243");
    }

    #[test]
    fn utf7_rfc2152_basic() {
        // Direct chars pass through
        assert_eq!(utf7_encode("Hello"), "Hello");
        // Plus sign escaped
        assert_eq!(utf7_encode("A+B"), "A+-B");
        // Non-ASCII encoded
        assert!(utf7_encode("日本語").starts_with('+'));
    }

    #[test]
    fn utf7_rfc2152_decodeable() {
        // RFC 2152's own example encodes 日本語 (U+65E5 U+672C U+8A9E) as
        // `+ZeVnLIqe-`. This test only checks that the shift-in (`+`) and
        // shift-out (`-`) markers are present, not the exact Base64 run.
        let encoded = utf7_encode("日本語");
        assert!(encoded.contains('+'));
        assert!(encoded.contains('-'));
    }

    #[test]
    fn gzip_roundtrip() {
        let original = b"SELECT * FROM users";
        let encoded = gzip_encode(original).unwrap();
        assert!(!encoded.is_empty());
        // The encoder returns Base64 text; decode it, then gunzip.
        let decoded = general_purpose::STANDARD.decode(&encoded).unwrap();
        let mut decoder = flate2::read::GzDecoder::new(&decoded[..]);
        let mut decompressed = Vec::new();
        std::io::Read::read_to_end(&mut decoder, &mut decompressed).unwrap();
        assert_eq!(decompressed, original);
    }

    #[test]
    fn deflate_roundtrip() {
        let original = b"SELECT * FROM users";
        let encoded = deflate_encode(original).unwrap();
        assert!(!encoded.is_empty());
        // The encoder returns Base64 text; decode it, then inflate.
        let decoded = general_purpose::STANDARD.decode(&encoded).unwrap();
        let mut decoder = flate2::read::DeflateDecoder::new(&decoded[..]);
        let mut decompressed = Vec::new();
        std::io::Read::read_to_end(&mut decoder, &mut decompressed).unwrap();
        assert_eq!(decompressed, original);
    }
}